In [191]:
import pandas as pd 
#import warnings
import re
#import emoji
#warnings.filterwarnings('ignore')  # Ignore all warnings

# Read the raw export once; `final_df` and `df` start out as two names
# for the same DataFrame object.
file_path = "twitter_data.csv"
df = final_df = pd.read_csv(file_path)

In [192]:
# Assign column names

# reset_index returns a new frame with a fresh 0..n-1 index
df = df.reset_index(drop=True)

# Replacement names, assigned positionally to the existing columns
new_columns = ['index', 'user', 'tweet', 'VAR', 'twitterlink', 'username']

# The literals above carry no surrounding whitespace and contain no
# duplicates, so they are assigned as they are
df.columns = new_columns


In [193]:
# Remove the 'index' and 'VAR' columns in a single call
df = df.drop(columns=['index', 'VAR'])

In [194]:
# Convert the user, tweet and username columns to strings.
# Missing values are replaced with '' first: a bare astype(str) turns NaN
# into the literal text 'nan', which the later cleaning steps would then
# handle as if it were real content.
text_columns = ['user', 'tweet', 'username']
df[text_columns] = df[text_columns].fillna('').astype(str)

#print(df.dtypes)

In [195]:
# Clean the first column (users)

# Strip leading (left) double-quote characters via the vectorised string accessor
df['user'] = df['user'].str.lstrip('"')

# Reduce a raw "user" value to an @handle or a punctuation-free name
def process_username(text):
    """Return the first @handle found in ``text``, or a cleaned copy of ``text``.

    * If ``text`` contains ``@`` followed by word characters anywhere, the
      result is ``'@'`` plus the first such run of word characters.
    * Otherwise, if ``text`` contains a hyphen, every character that is
      neither a word character nor whitespace is removed from the whole
      string (not only the part after the hyphen).
    * Otherwise ``text`` is returned unchanged.
    """
    handle = re.search(r'@(\w+)', text)
    if handle is not None:
        return '@' + handle.group(1)

    return re.sub(r"[^\w\s]", "", text) if "-" in text else text

# Run process_username over every value of the "user" column
df["user"] = df["user"].map(process_username)

# Keep only the text that precedes the first "on X:" marker
def remove_after_on_x(text):
    """Return ``text`` up to, but not including, the first ``"on X:"``.

    Text that does not contain the marker is returned unchanged.
    """
    head, _marker, _rest = text.partition("on X:")
    return head

# Run remove_after_on_x over every value of the "user" column
df["user"] = df["user"].map(remove_after_on_x)

# Remove words after the 3rd word
def remove_words_after_3rd(text):
    """Return the first three whitespace-separated words of ``text``, joined by single spaces."""
    return " ".join(text.split()[:3])

# Truncate every "user" value to its first three words
df['user'] = df['user'].map(remove_words_after_3rd)

# Compare the first 3 words of the "user" and "tweet" columns
def compare_and_replace(row):
    """Return ``"unknown user"`` when user and tweet begin with the same words.

    The first three whitespace-separated words of ``row["user"]`` and
    ``row["tweet"]`` are compared. If the two word lists are equal the
    placeholder string is returned; otherwise ``row["user"]`` is returned
    unchanged.
    """
    leading_user_words = row["user"].split()[:3]
    leading_tweet_words = row["tweet"].split()[:3]
    return "unknown user" if leading_user_words == leading_tweet_words else row["user"]

# Evaluate compare_and_replace row by row and store the result as "user"
df["user"] = df.apply(compare_and_replace, axis="columns")

# Delete image rows
# Remove rows where the "user" column begins with "file:///var/mobile/Library/".
# An explicit copy is taken so the column assignment below works on an
# independent frame instead of a slice of the old one (which would raise
# SettingWithCopyWarning).
is_image_row = df['user'].str.startswith("file:///var/mobile/Library/")
df = df.loc[~is_image_row].copy()

# Remove the literal text 'Xcom' from every row in the "user" column
# (regex=False: plain substring replacement on every pandas version)
df['user'] = df['user'].str.replace('Xcom', '', regex=False)


#Display only the first column
#print(df.iloc[:, 0])


In [196]:
# Rename the 'user' column to 'name'
df = df.rename({'user': 'name'}, axis='columns')
#checking it
#print(df["name"])

In [197]:
# Clean the tweet column

# Applied in order as one chain:
#   1. drop "Embedded video" and the rest of that line
#   2. keep only the text before the first 'Reposts'
#   3. drop 'Â·' and the rest of that line
df['tweet'] = (
    df['tweet']
    .str.replace(r'Embedded video[^\n]*', '', regex=True)
    .str.split('Reposts').str[0]
    .str.replace(r'Â·[^\n]*', '', regex=True)
)

# Cut a tweet at the first middle dot and trim surrounding whitespace
def clean_tweet(tweet):
    """Return the part of ``tweet`` before the first U+00B7 (middle dot), stripped.

    The character is written as an escape sequence so the source does not
    depend on the file's encoding.
    """
    before_dot, _dot, _after = tweet.partition('\u00b7')
    return before_dot.strip()

# Run clean_tweet over every value of the 'tweet' column
df['tweet'] = df['tweet'].map(clean_tweet)

# Characters treated as emoji/pictographic symbols. Compiled once instead of
# on every call. The ranges are listed individually on purpose: one span from
# U+24C2 to U+1F251 would also cover CJK ideographs, kana and Hangul, so
# ordinary text in those scripts would be deleted.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F2FF"  # tiles, cards, enclosed alphanumerics (incl. flag letters), enclosed ideographs
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000027BF"  # Miscellaneous Symbols and Dingbats
    "\U00002934\U00002935"   # curved arrows
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


# Function to remove all emojis from a tweet
def remove_emojis(tweet):
    """Return ``tweet`` with every character matched by ``_EMOJI_PATTERN`` removed.

    Letters of any script, including Chinese, Japanese and Korean text, are
    left in place; only the symbol ranges listed in the pattern are stripped.
    """
    return _EMOJI_PATTERN.sub('', tweet)

# Run remove_emojis over every value of the 'tweet' column
df['tweet'] = df['tweet'].map(remove_emojis)

# Strip the "Born ... Joined ... Following" profile blurb from a tweet
def remove_pattern(tweet):
    """Delete every ``Born <word> <digits> Joined <word> <yyyy>. <digits> Following`` run.

    Note: the name ``remove_pattern`` is redefined further down in this
    notebook, so this version is only in effect until that point.
    """
    born_joined_blurb = r'Born [A-Za-z]+ \d+ Joined [A-Za-z]+ \d{4}\. \d+ Following'
    return re.sub(born_joined_blurb, '', tweet)

# Run the remove_pattern defined just above over the 'tweet' column
df['tweet'] = df['tweet'].map(remove_pattern)

# Strip the "Joined ... Following" profile blurb from a tweet
def remove_pattern(tweet):
    """Delete every ``Joined <word> <yyyy>. <digits> Following`` run.

    Note: this replaces the ``remove_pattern`` defined earlier in the
    notebook and is itself redefined again further down.
    """
    joined_blurb = r'Joined [A-Za-z]+ \d{4}\. \d+ Following'
    return re.sub(joined_blurb, '', tweet)

# Run the remove_pattern defined just above over the 'tweet' column
df['tweet'] = df['tweet'].map(remove_pattern)

# Strip the "Quote. Square profile picture ... linktr.ee ... Joined ..." blurb from a tweet
def remove_pattern(tweet):
    """Delete every quoted-profile blurb matched by the pattern below.

    Note: this replaces the ``remove_pattern`` definitions that appear
    earlier in the notebook.
    """
    quoted_profile_blurb = r'Quote\. Square profile picture ,Discord: .*? ,linktr\.ee/[^ ]* Joined [A-Za-z]+ \d{4}\. \d+:\d+ [APMapm]+ ,'
    return re.sub(quoted_profile_blurb, '', tweet)

# Run the remove_pattern defined just above over the 'tweet' column
df['tweet'] = df['tweet'].map(remove_pattern)

# Building blocks for the date/time pattern.
# Time of day: H:MM, an optional :SS group, then an optional space and an
# optional AM/PM marker.
_TIME_OF_DAY = r'\d{1,2}:\d{1,2}(?::\d{1,2})?[ ]?(?:AM|PM)?'
# Numeric date such as 12/25/2023 or 1/2/23.
_NUMERIC_DATE = r'\d{1,2}/\d{1,2}/\d{2,4}'

# Alternatives are tried left to right at each position, so the combined
# "date time" form has to come before the date-only form; otherwise the date
# alone matches first and the combined form can never be used.
_DATETIME_PATTERN = re.compile('|'.join([
    r'\b' + _NUMERIC_DATE + r' ' + _TIME_OF_DAY + r'\b',
    r'\b' + _NUMERIC_DATE + r'\b',
    r'\b' + _TIME_OF_DAY + r'\b',
    r'\b(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-zA-Z]*,? [A-Za-z]+ \d{1,2},? \d{2,4}\b',
]))


# Function to remove datetime formats
def remove_datetime_formats(tweet):
    """Return ``tweet`` with common date/time expressions removed.

    Removed forms:
      * numeric date followed by a time, e.g. ``12/25/2023 10:30 AM``
      * numeric date, e.g. ``12/25/2023``
      * time of day with optional seconds and AM/PM, e.g. ``10:30:45 PM``
      * weekday-led dates, e.g. ``Monday, January 5, 2024``

    The seconds part is a real optional ``:SS`` group, so digits, commas or
    braces that merely follow something time-like are not swallowed.
    """
    return _DATETIME_PATTERN.sub('', tweet)

# Run remove_datetime_formats over every value of the 'tweet' column
df['tweet'] = df['tweet'].map(remove_datetime_formats)

In [198]:
# Clean the last column (username)

# Pull the handle out of breadcrumb-style text of the form
# "https://twitter.com › <handle> › status"; rows without it become NaN
extracted_username = df['username'].str.extract(
    r'https://twitter\.com › (\S+) › status', expand=False
)


def _username_from_link(link):
    """Derive the handle from a tweet URL; non-string (missing) links are passed through."""
    if not isinstance(link, str):
        # A NaN link has no .replace(); leave the value missing instead of raising
        return link
    return link.replace('https://twitter.com/', '').split('/status/')[0]


# Fall back to the tweet link wherever the breadcrumb pattern did not match
df['username'] = extracted_username.fillna(df['twitterlink'].map(_username_from_link))

# Strip any remaining URL prefix / status suffix.
# `regex` is given explicitly: the default differs between pandas versions,
# and with a literal match the '/status/.*' pattern would never remove anything.
df['username'] = (
    df['username']
    .str.replace('https://twitter.com/', '', regex=False)
    .str.replace(r'/status/.*', '', regex=True)
)

# Display the updated DataFrame
#print(df['username'])


In [199]:
#df.columns
#print(df.head(10))
#print(df.iloc[1600, 0])
#print(len(df))


In [200]:
# Write the DataFrame `df` to a CSV file, leaving out the row index
df.to_csv(path_or_buf='cleaned_dataset.csv', index=False)

#checking_if_saved_file_is_empty = pd.read_csv("cleaned_dataset.csv")
#print(checking_if_saved_file_is_empty.head(10))