# working on cleaning data


To remove all component details within the "Content" column in such a way that we only have text.
1. Drop rows that contain anything different text. (Pauses, Actions, Unintelligble Speech)
1. Remove punctuation marks from every row. (Removing ".", ",", "'", "?", etc.)
1. Always keep the corresponding "StTime" and "EnTime".
1. You can ommit/remove the "Spkr" "Line" or line columns.

In [1]:
import pandas as pd
import numpy as np
import torchaudio
import torch

In [4]:
# Initialize directory
transcript_dir = "/content/text"

transcript_df = pd.read_csv(transcript_dir+"/ATL_se0_ag1_f_01_1.txt", delimiter="\t")
transcript_df.head()

Unnamed: 0,Line,Spkr,StTime,Content,EnTime
0,1,ATL_se0_ag1_f_01,0.4436,"They talking about, don't send him to his daddy.",2.4068
1,2,ATL_se0_ag1_f_01,2.4068,(pause 0.28),2.6829
2,3,ATL_se0_ag1_f_01,2.6829,You just need to go file for child support.,4.9538
3,4,ATL_int_01,5.1142,[/Oh man/.],5.6125
4,5,ATL_se0_ag1_f_01,5.1488,[Bye.],5.5995


In [5]:
transcript_df

Unnamed: 0,Line,Spkr,StTime,Content,EnTime
0,1,ATL_se0_ag1_f_01,0.4436,"They talking about, don't send him to his daddy.",2.4068
1,2,ATL_se0_ag1_f_01,2.4068,(pause 0.28),2.6829
2,3,ATL_se0_ag1_f_01,2.6829,You just need to go file for child support.,4.9538
3,4,ATL_int_01,5.1142,[/Oh man/.],5.6125
4,5,ATL_se0_ag1_f_01,5.1488,[Bye.],5.5995
...,...,...,...,...,...
1228,1229,ATL_se0_ag1_f_01,1855.6029,[Wanna] play?,1856.1984
1229,1230,ATL_int_01,1855.8262,[Hm.],1856.1798
1230,1231,ATL_int_01,1856.3473,Yeah let me- let me see it.,1857.6250
1231,1232,ATL_se0_ag1_f_01,1858.3942,Your phone dead.,1859.3929


---

# cleaing df of unwanted data on a copy df

# droped rows with [], ect

In [6]:
import pandas as pd
import re

# # Load your original DataFrame
# transcript_dir = "../notebooks/coraal/transcript/text"
# transcript_df = pd.read_csv(transcript_dir + "/ATL_se0_ag1_f_01_1.txt", delimiter="\t")

# Create a copy of the original DataFrame to avoid corruption
cleaned_df = transcript_df.copy()

# Function to clean the "Content" column
def clean_content(content):
    # Remove anything within [ … ], / … /, and < … >
    content = re.sub(r'\[.*?\]|\<.*?\>|\/.*?\/', '', content)
    # Remove punctuation marks
    content = re.sub(r'[^\w\s]', '', content)
    # Strip leading and trailing whitespaces
    content = content.strip()
    return content

# Apply the cleaning function to the "Content" column
cleaned_df['CleanedContent'] = cleaned_df['Content'].apply(clean_content)

# Drop rows where "CleanedContent" is empty after cleaning
cleaned_df = cleaned_df[cleaned_df['CleanedContent'] != '']

# Keep only the "StTime", "EnTime", and "CleanedContent" columns
final_df = cleaned_df[['StTime', 'EnTime', 'CleanedContent']]

# Display the cleaned DataFrame
print(final_df.head())


   StTime  EnTime                                 CleanedContent
0  0.4436  2.4068  They talking about dont send him to his daddy
1  2.4068  2.6829                                      pause 028
2  2.6829  4.9538     You just need to go file for child support
5  6.0026  6.3944                                            Why
6  6.3944  7.1901                                      pause 080


# removed rows still containg pause

In [7]:
import pandas as pd
import re

# # Load your original DataFrame
# transcript_dir = "../notebooks/coraal/transcript/text"
# transcript_df = pd.read_csv(transcript_dir + "/ATL_se0_ag1_f_01_1.txt", delimiter="\t")

# # Create a copy of the original DataFrame to avoid corruption
# cleaned_df = transcript_df.copy()

# Function to clean the "Content" column
def clean_content(content):
    # Remove anything within [ … ], / … /, and < … >
    content = re.sub(r'\[.*?\]|\<.*?\>|\/.*?\/', '', content)
    # Remove punctuation marks
    content = re.sub(r'[^\w\s]', '', content)
    # Strip leading and trailing whitespaces
    content = content.strip()
    return content

# Apply the cleaning function to the "Content" column
cleaned_df['CleanedContent'] = cleaned_df['Content'].apply(clean_content)

# Drop rows where "CleanedContent" is empty after cleaning
cleaned_df = cleaned_df[cleaned_df['CleanedContent'] != '']

# Remove rows containing the word "pause" in "CleanedContent"
cleaned_df = cleaned_df[~cleaned_df['CleanedContent'].str.contains(r'\bpause\b', case=False, na=False)]

# Keep only the "StTime", "EnTime", and "CleanedContent" columns
final_df = cleaned_df[['StTime', 'EnTime', 'CleanedContent']]

# Display the cleaned DataFrame
print(final_df.head())


   StTime  EnTime                                 CleanedContent
0  0.4436  2.4068  They talking about dont send him to his daddy
2  2.6829  4.9538     You just need to go file for child support
5  6.0026  6.3944                                            Why
7  7.1901  7.5398                                            Why
8  8.6665  9.7375                           Okay whats your name


# fixing index misorder after row deletion

In [8]:
import pandas as pd
import re

# # Load your original DataFrame
# transcript_dir = "../notebooks/coraal/transcript/text"
# transcript_df = pd.read_csv(transcript_dir + "/ATL_se0_ag1_f_01_1.txt", delimiter="\t")

# # Create a copy of the original DataFrame to avoid corruption
# cleaned_df = transcript_df.copy()

# Function to clean the "Content" column
def clean_content(content):
    # Remove anything within [ … ], / … /, and < … >
    content = re.sub(r'\[.*?\]|\<.*?\>|\/.*?\/', '', content)
    # Remove punctuation marks
    content = re.sub(r'[^\w\s]', '', content)
    # Strip leading and trailing whitespaces
    content = content.strip()
    return content

# Apply the cleaning function to the "Content" column
cleaned_df['CleanedContent'] = cleaned_df['Content'].apply(clean_content)

# Drop rows where "CleanedContent" is empty after cleaning
cleaned_df = cleaned_df[cleaned_df['CleanedContent'] != '']

# Remove rows containing the word "pause" in "CleanedContent"
cleaned_df = cleaned_df[~cleaned_df['CleanedContent'].str.contains(r'\bpause\b', case=False, na=False)]

# Keep only the "StTime", "EnTime", and "CleanedContent" columns
final_df = cleaned_df[['StTime', 'EnTime', 'CleanedContent']]

# Reset the index of the cleaned DataFrame
final_df.reset_index(drop=True, inplace=True)

# Display the cleaned DataFrame
print(final_df.head())


   StTime  EnTime                                 CleanedContent
0  0.4436  2.4068  They talking about dont send him to his daddy
1  2.6829  4.9538     You just need to go file for child support
2  6.0026  6.3944                                            Why
3  7.1901  7.5398                                            Why
4  8.6665  9.7375                           Okay whats your name


# cleeaned df keeping puncuations and droping rows of Special symbols

In [9]:
import pandas as pd
import re

# # Load your original DataFrame
# transcript_dir = "../notebooks/coraal/transcript/text"
# transcript_df = pd.read_csv(transcript_dir + "/ATL_se0_ag1_f_01_1.txt", delimiter="\t")

# # Create a copy of the original DataFrame to avoid corruption
# cleaned_df = transcript_df.copy()

# Function to check for unwanted patterns in the "Content" column
def contains_unwanted_patterns(content):
    patterns = [r'\[.*?\]', r'\<.*?\>', r'\/.*?\/', r'\(.*?\)']
    for pattern in patterns:
        if re.search(pattern, content):
            return True
    return False

# Apply the function to filter rows with unwanted patterns
cleaned_df = cleaned_df[~cleaned_df['Content'].apply(contains_unwanted_patterns)]

# Remove rows where "Content" is empty after removing unwanted patterns
cleaned_df = cleaned_df[cleaned_df['Content'].str.strip() != '']

# Remove rows containing the word "pause" in "Content"
cleaned_df = cleaned_df[~cleaned_df['Content'].str.contains(r'\bpause\b', case=False, na=False)]

# Keep only the "StTime", "EnTime", and "Content" columns
final_df = cleaned_df[['StTime', 'EnTime', 'Content']]

# Reset the index of the cleaned DataFrame
final_df.reset_index(drop=True, inplace=True)

# Display the cleaned DataFrame
print(final_df.head())


   StTime  EnTime                                           Content
0  0.4436  2.4068  They talking about, don't send him to his daddy.
1  2.6829  4.9538       You just need to go file for child support.
2  6.0026  6.3944                                              Why?
3  7.1901  7.5398                                              Why?
4  8.6665  9.7375                           Okay, what's your name?


---

# checking df of clean vs not

In [None]:
# Print 'CleanedContent' column of DataFrame
final_df['CleanedContent']

0          They talking about dont send him to his daddy
1             You just need to go file for child support
2                                                    Why
3                                                    Why
4                                   Okay whats your name
                             ...                        
664                             the um Game Pigeon games
665                                                 play
666                            Yeah let me let me see it
667                                      Your phone dead
668    My phone all the way it And the charger right ...
Name: CleanedContent, Length: 669, dtype: object

In [None]:
# Print size of 'CleanedContent' column in DataFrame
final_df['CleanedContent'].size

669

In [None]:
# Print size of 'Content' column in DataFrame
transcript_df['Content'].size

1233

In [None]:
# Print 'Content' column of DataFrame
final_df['Content']

0       They talking about, don't send him to his daddy.
1            You just need to go file for child support.
2                                                   Why?
3                                                   Why?
4                                Okay, what's your name?
                             ...                        
488       Oh. What kind of games you play on your phone?
489                                                 Psh.
490                          Yeah let me- let me see it.
491                                     Your phone dead.
492    My phone all the way- it- And the charger righ...
Name: Content, Length: 493, dtype: object

# look at different samples for missed cleaned data

In [None]:
# Print 10 sample values of 'Content' column of DataFrame
final_df['CleanedContent'].sample(10)

329    Aint gonna lie I was like oh man I forgot how ...
104                                         whos younger
93                                     No thats for real
573                       Yeah thats all I could say too
66                                  I have four brothers
437                                               Im not
314    you can ak you cant ask them about that Like t...
195                                               So  um
450                                    Nah that was real
57                                              You know
Name: CleanedContent, dtype: object

In [None]:
# Print details of 'CleanedContent' column of DataFrame
final_df['CleanedContent'].describe()

count      669
unique     601
top       Okay
freq        14
Name: CleanedContent, dtype: object

In [None]:
# df dropping rows containing special symbols from documentation and keeping puncuations
# sampling to see if df was cleaned to desired parameters
final_df['Content'].sample(10)


352    you know. I give people the benefit of the dou...
152    it was good. It was a excellent school you kno...
191                      Ooh. Y'all had economics early?
433                                             Oh yeah?
20                                         Neighborhood.
289                August Alsina pretty good doing that.
322                                                Very.
358                                           Dang bruh.
144                  since you said something about, um,
101                                                 But-
Name: Content, dtype: object

In [None]:
# size of new df dropping rows with special symbols and keeping puncuations
final_df['Content'].size

493

In [None]:
# error not used now just tests
final_df[final_df['CleanedContent'] == []].sample(10)


ValueError: ('Lengths must match to compare', (1014,), (0,))

---

# to iterate through df

In [None]:
# Iterate through each row in our dataframe
for row in transcript_df.iterrows():
    print(row)
    break

(0, Line                                                      1
Spkr                                       ATL_se0_ag1_f_01
StTime                                               0.4436
Content    They talking about, don't send him to his daddy.
EnTime                                               2.4068
Name: 0, dtype: object)


In [None]:
# Iterates through columns
for data in transcript_df:
    print(data)
    break

Line


In [None]:
# Print tuples in DataFrame
for tuples in transcript_df.itertuples():
    display(tuples)
    break

Pandas(Index=0, Line=1, Spkr='ATL_se0_ag1_f_01', StTime=0.4436, Content="They talking about, don't send him to his daddy.", EnTime=2.4068)