# Transcripts Cleaning

In [1]:
import pandas as pd
import re

class TextTransform:
    """Two-way lookup between transcript characters and integer labels."""

    def __init__(self):
        # A symbol's position in this tuple is its integer label:
        # apostrophe -> 0, the <SPACE> placeholder -> 1, letters -> 2..36.
        symbols = (
            "'", "<SPACE>",
            "ا", "ب", "ت", "ث", "ج", "ح", "خ",
            "د", "ذ", "ر", "ز", "س", "ش", "ص",
            "ض", "ط", "ظ", "ع", "غ", "ف", "ق",
            "ك", "ل", "م", "ن", "ه", "و", "ي",
            "ء", "آ", "أ", "ؤ", "إ", "ئ", "ى",
        )
        self.char_map = {symbol: label for label, symbol in enumerate(symbols)}
        self.index_map = {label: symbol for symbol, label in self.char_map.items()}
        # Decoding label 1 yields a real space rather than the placeholder token.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Encode ``text`` as a list of integer labels.

        A space maps to the ``<SPACE>`` label, a known character to its own
        label, and anything else to 0.
        """
        return [
            self.char_map['<SPACE>'] if symbol == ' ' else self.char_map.get(symbol, 0)
            for symbol in text
        ]

    def int_to_text(self, labels):
        """Decode an iterable of integer labels back into a string."""
        decoded = ''.join(self.index_map[label] for label in labels)
        return decoded.replace('<SPACE>', ' ')

def preprocess_arabic_transcripts(csv_file, output_file):
    """Clean the ``transcript`` column of a CSV file and save the result.

    Rows whose transcript is missing are reported on stdout and dropped. Each
    remaining transcript is filtered so that only spaces and characters that
    are keys of ``TextTransform().char_map`` survive; nothing else about the
    text (spacing, order) is changed.

    Args:
        csv_file: Path of the input CSV, read as UTF-8. It must contain a
            ``transcript`` column; the ``audio`` column is read only when a
            missing transcript has to be reported.
        output_file: Path the cleaned frame is written to (UTF-8, without the
            index).

    Returns:
        pandas.DataFrame: the cleaned frame, re-indexed from 0 after the rows
        with missing transcripts were dropped.

    NOTE(review): a transcript made up solely of unmapped characters becomes
    an empty string and its row is kept. Presumably such a field is read back
    as NaN by a default ``pd.read_csv`` of ``output_file`` -- TODO confirm and
    decide whether these rows should be dropped as well.
    """
    # Every key of the character map plus the literal space survives cleaning.
    allowed_chars = set(TextTransform().char_map) | {' '}

    def keep_allowed(text):
        """Return ``text`` without the characters outside the vocabulary."""
        return ''.join(c for c in text if c in allowed_chars)

    # Load the CSV file with proper encoding for Arabic text
    transcripts_df = pd.read_csv(csv_file, encoding='utf-8')

    # Index labels of the rows whose transcript is missing
    missing = pd.isna(transcripts_df['transcript'])
    removed_rows = transcripts_df[missing].index.tolist()
    for idx in removed_rows:
        # Labels (not positions) were collected above, so look up by label.
        print(f"Removed row '{transcripts_df.loc[idx, 'audio']}' due to NaN transcript")

    # Drop rows with NaN transcripts and re-index the remaining rows from 0
    transcripts_df = (
        transcripts_df
        .dropna(subset=['transcript'])
        .reset_index(drop=True)
    )

    # Replace each transcript with its cleaned version in one pass
    transcripts_df['transcript'] = transcripts_df['transcript'].map(keep_allowed)

    # Save the final edited DataFrame to a new CSV file
    transcripts_df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"Indices of rows with NaN transcripts: {removed_rows}")

    return transcripts_df

if __name__ == "__main__":
    # Raw transcripts go in, the cleaned copy is written next to them.
    csv_file, output_file = "./data/a.csv", "./data/aa.csv"
    cleaned_df = preprocess_arabic_transcripts(
        csv_file=csv_file,
        output_file=output_file,
    )

Removed row 'train_sample_1' due to NaN transcript
Removed row 'train_sample_5' due to NaN transcript
Removed row 'train_sample_7' due to NaN transcript
Indices of rows with NaN transcripts: [1, 5, 7]


In [2]:
# Spot-check five randomly chosen cleaned rows (no random_state is set, so the
# selection differs from run to run).
cleaned_df.sample(n=5)

Unnamed: 0,audio,transcript
0,train_sample_0,على إنها عار في الوقت اللي كانت بتتعامل مع أخو...
6,train_sample_9,لكن الحاج فهمي كان هو البدايه إن البدايه إنك إ...
4,train_sample_6,إم دي اللي تقدر تفرق لاعيب من اللاعيب للاعيب ف...
2,train_sample_3,شيشكب شيشش ششش ؤشششش
3,train_sample_4,والله هي الموضوع مش كليب خلي بالك ولا أغنيه ال...


In [12]:
import pandas as pd
import re

class TextTransform:
    """Two-way lookup between transcript characters and integer labels."""

    def __init__(self):
        # A symbol's position in this tuple is its integer label:
        # apostrophe -> 0, the <SPACE> placeholder -> 1, letters -> 2..36.
        symbols = (
            "'", "<SPACE>",
            "ا", "ب", "ت", "ث", "ج", "ح", "خ",
            "د", "ذ", "ر", "ز", "س", "ش", "ص",
            "ض", "ط", "ظ", "ع", "غ", "ف", "ق",
            "ك", "ل", "م", "ن", "ه", "و", "ي",
            "ء", "آ", "أ", "ؤ", "إ", "ئ", "ى",
        )
        self.char_map = {symbol: label for label, symbol in enumerate(symbols)}
        self.index_map = {label: symbol for symbol, label in self.char_map.items()}
        # Decoding label 1 yields a real space rather than the placeholder token.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Encode ``text`` as a list of integer labels.

        A space maps to the ``<SPACE>`` label, a known character to its own
        label, and anything else to 0.
        """
        return [
            self.char_map['<SPACE>'] if symbol == ' ' else self.char_map.get(symbol, 0)
            for symbol in text
        ]

    def int_to_text(self, labels):
        """Decode an iterable of integer labels back into a string."""
        decoded = ''.join(self.index_map[label] for label in labels)
        return decoded.replace('<SPACE>', ' ')

def preprocess_arabic_transcripts(csv_file, output_file):
    """Normalise and clean the ``transcript`` column of a CSV file, then save it.

    Rows whose transcript is missing are reported on stdout and dropped. In
    each remaining transcript a few letter variants are first replaced by the
    letter listed for them in ``replacements`` below; afterwards only spaces
    and characters that are keys of ``TextTransform().char_map`` are kept.

    Args:
        csv_file: Path of the input CSV, read as UTF-8. It must contain a
            ``transcript`` column; the ``audio`` column is read only when a
            missing transcript has to be reported.
        output_file: Path the cleaned frame is written to (UTF-8, without the
            index).

    Returns:
        pandas.DataFrame: the cleaned frame, re-indexed from 0 after the rows
        with missing transcripts were dropped.

    NOTE(review): a transcript made up solely of unmapped characters becomes
    an empty string and its row is kept. Presumably such a field is read back
    as NaN by a default ``pd.read_csv`` of ``output_file`` -- TODO confirm and
    decide whether these rows should be dropped as well.
    """
    # Every key of the character map plus the literal space survives cleaning.
    allowed_chars = set(TextTransform().char_map) | {' '}

    # Variant letters folded onto a letter of the vocabulary before filtering.
    # Built once here instead of once per transcript.
    replacements = {
        'ٱ': 'ا',
        'چ': 'ج',
        'ڨ': 'ف',
        'ة': 'ه',
    }

    def clean(text):
        """Apply the replacements, then drop characters outside the vocabulary."""
        for old_char, new_char in replacements.items():
            text = text.replace(old_char, new_char)
        return ''.join(c for c in text if c in allowed_chars)

    # Load the CSV file with proper encoding for Arabic text
    transcripts_df = pd.read_csv(csv_file, encoding='utf-8')

    # Index labels of the rows whose transcript is missing
    missing = pd.isna(transcripts_df['transcript'])
    removed_rows = transcripts_df[missing].index.tolist()
    for idx in removed_rows:
        # Labels (not positions) were collected above, so look up by label.
        print(f"Removed row '{transcripts_df.loc[idx, 'audio']}' due to NaN transcript")

    # Drop rows with NaN transcripts and re-index the remaining rows from 0
    transcripts_df = (
        transcripts_df
        .dropna(subset=['transcript'])
        .reset_index(drop=True)
    )

    # Replace each transcript with its cleaned version in one pass
    transcripts_df['transcript'] = transcripts_df['transcript'].map(clean)

    # Save the final edited DataFrame to a new CSV file
    transcripts_df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"Indices of rows with NaN transcripts: {removed_rows}")

    return transcripts_df

if __name__ == "__main__":
    # Raw transcripts go in, the cleaned copy is written next to them.
    csv_file, output_file = "./data/a.csv", "./data/aa.csv"
    # Keep the untouched input around so it can be compared with the cleaned frame.
    df = pd.read_csv(csv_file)
    cleaned_df = preprocess_arabic_transcripts(
        csv_file=csv_file,
        output_file=output_file,
    )

Removed row 'train_sample_1' due to NaN transcript
Removed row 'train_sample_5' due to NaN transcript
Removed row 'train_sample_7' due to NaN transcript
Indices of rows with NaN transcripts: [1, 5, 7]


In [14]:
# First five rows of the raw input, for comparison with the cleaned frame.
df.head(n=5)

Unnamed: 0,audio,transcript
0,train_sample_0,على إنها عار في الوقت اللي كانت بتتعامل مع أخو...
1,train_sample_1,
2,train_sample_2,زي دول كتيره بنشوفها النهارده في العالم وأصبحت...
3,train_sample_3,ada شيش@dadكبة شچچچچي١١شڨڨ،،ٱش ش١شش ؤ99شش99ش99ش
4,train_sample_4,والله هي الموضوع مش كليب خلي بالك ولا أغنيه ال...


In [15]:
# First five rows after cleaning; rows with missing transcripts are gone and
# the index restarts at 0.
cleaned_df.head(n=5)

Unnamed: 0,audio,transcript
0,train_sample_0,على إنها عار في الوقت اللي كانت بتتعامل مع أخو...
1,train_sample_2,زي دول كتيره بنشوفها النهارده في العالم وأصبحت...
2,train_sample_3,شيشكبه شججججيشففاش ششش ؤشششش
3,train_sample_4,والله هي الموضوع مش كليب خلي بالك ولا أغنيه ال...
4,train_sample_6,إم دي اللي تقدر تفرق لاعيب من اللاعيب للاعيب ف...
