In [148]:
import pandas as pd 
import warnings
import re

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas warnings
# (e.g. chained-assignment warnings) that could point at real problems.
warnings.filterwarnings('ignore')  # Ignore all warnings

# Load the raw data from a CSV file; the path is relative to the working directory.
file_path = "twitter_data.csv"
final_df = pd.read_csv(file_path)
# `df` is a second name for the same DataFrame object as `final_df`
# (plain assignment, no copy is made here).
df = final_df

In [149]:
#Assigning columns

# Rebuild the frame with a fresh 0..n-1 index. reset_index returns a new
# DataFrame, so the renaming below is applied to that new object.
df = df.reset_index(drop=True)

# Names applied positionally to the columns of the frame.
new_columns = [
    'index',
    'user',
    'tweet',
    'VAR',
    'TwitterLink',
    'username'
]

# Rename positionally. The literals above are already whitespace-free and
# distinct, so no stripping or de-duplication step is needed (calling
# .unique() on the freshly assigned names could never repair duplicates: it
# would only shorten the list and make the assignment fail).
# pandas raises ValueError if the frame does not have exactly
# len(new_columns) columns.
df.columns = new_columns


In [150]:
# Discard the 'index' and 'VAR' columns in a single call; drop() returns a
# new frame without them and raises KeyError if either name is missing.
df = df.drop(columns=['index', 'VAR'])

In [151]:
# Cast the "user" and "tweet" columns to str so that the string clean-up
# steps further down can be applied to every value.
# NOTE(review): depending on the pandas version, missing values may turn into
# the literal text "nan" after this cast - confirm that is acceptable.
for text_column in ("user", "tweet"):
    df[text_column] = df[text_column].astype(str)

In [152]:
# Clean the first column (user): remove any leading double-quote characters.
df['user'] = df['user'].str.lstrip('"')

#Normalise a raw "user" string: prefer an @handle, otherwise strip punctuation
def process_username(text):
    """Return a cleaned version of a raw user string.

    The rules are applied in this order:

    1. If the text contains an ``@`` followed by one or more word characters
       (anywhere, not only at the start), the first such handle is returned,
       including the leading ``@``.
    2. Otherwise, if the text contains a hyphen, every character that is
       neither a word character nor whitespace is removed. Note that the text
       following the hyphen is kept; only the punctuation itself is dropped.
    3. Otherwise the text is returned unchanged.
    """
    handle = re.search(r'@(\w+)', text)
    if handle is not None:
        return '@' + handle.group(1)

    # No handle and no hyphen: nothing to clean.
    if "-" not in text:
        return text

    return re.sub(r"[^\w\s]", "", text)

# Run process_username on every value of the "user" column.
df["user"] = df["user"].map(process_username)

#Keep only the part of the text that precedes the first "on X:"
def remove_after_on_x(text):
    """Return ``text`` up to, but not including, the first ``"on X:"``.

    The text is returned unchanged when the marker does not occur.
    """
    head, _marker, _rest = text.partition("on X:")
    return head

# Run remove_after_on_x on every value of the "user" column.
df["user"] = df["user"].map(remove_after_on_x)

#Remove every word after the 3rd word
def remove_words_after_3rd(text):
    """Return the first three whitespace-separated words of ``text``.

    The kept words are joined with single spaces, so runs of whitespace and
    leading/trailing whitespace in the input are collapsed.
    """
    return " ".join(text.split()[:3])

# Run remove_words_after_3rd on every value of the "user" column.
df['user'] = df['user'].map(remove_words_after_3rd)

#Define a function to compare the first 3 words of "user" and "tweet" columns
def compare_and_replace(row):
    """Return "unknown user" when the row's user repeats the start of its tweet.

    The first three whitespace-separated words of ``row["user"]`` and
    ``row["tweet"]`` are compared. If the two word lists are equal the string
    "unknown user" is returned; otherwise ``row["user"]`` is returned as is.
    """
    leading_user = row["user"].split()[:3]
    leading_tweet = row["tweet"].split()[:3]
    return "unknown user" if leading_user == leading_tweet else row["user"]

# Overwrite "user" with the result of compare_and_replace, evaluated row by row
df["user"] = df.apply(compare_and_replace, axis=1)

#Deleting images
# Remove rows where the "user" column begins with "file:///var/mobile/Library/".
# .copy() makes the filtered result an independent frame, so the column
# assignment below is not a write into a slice of the previous frame.
# NOTE(review): process_username strips punctuation from any value that
# contains "-", so such a value can no longer start with this prefix by the
# time the filter runs - confirm the filter still catches the intended rows.
df = df[~df['user'].str.startswith("file:///var/mobile/Library/")].copy()

#Remove 'Xcom' from every row in the "user" column
# 'Xcom' is a literal substring; regex=False states that explicitly so the
# result does not depend on the pandas version's default for `regex`.
df['user'] = df['user'].str.replace('Xcom', '', regex=False)


#Display only the first column
#print(df.iloc[:, 0])


In [153]:
#Cleaning the last column (username)

# Extract the account name from text of the form
# "https://twitter.com › <name> › status". Values that do not contain this
# pattern become NaN. The dot in "twitter.com" is escaped so that it matches
# only a literal "." rather than any character.
df['username'] = df['username'].str.extract(r'https://twitter\.com › (\S+) › status', expand=False)

# Display the updated DataFrame
#print(df['username'])


In [154]:
#df.columns
#print(df.head(10))
#print(df.iloc[1600, 0])
#print(len(df))


In [155]:
# Write the cleaned DataFrame to a CSV file in the working directory.
# index=False keeps the row index out of the output file.
df.to_csv(path_or_buf='updated_twitter_data_after_cleaning_process.csv', index=False)

#checking_if_saved_file_is_empty = pd.read_csv("updated_twitter_data_after_cleaning_process.csv")
#print(checking_if_saved_file_is_empty.head(10))