In [136]:
#1. Data Import + Basic Cleaning + Username/Date Prep

In [137]:
#step 1 - import pandas to work with the csv dataset, numpy for numerical operations,
#re for regular-expression pattern matching, emoji for emoji-related features and StandardScaler for feature scaling
import pandas as pd
import numpy as np
import re
import emoji
from sklearn.preprocessing import StandardScaler

#step 2 - read the tweets csv file into a dataframe
df = pd.read_csv(filepath_or_buffer="Tweets.csv")

#step 3 - preview the data by printing the first 5 rows
print(df.head(n=5))

   id                                               link  \
0   0  https://twitter.com/HackneyPSC/status/17274436...   
1   1  https://twitter.com/cherrysattitude/status/172...   
2   2  https://twitter.com/diamoundgirls2/status/1710...   
3   3  https://twitter.com/mmtchi/status/172764634165...   
4   4  https://twitter.com/NoahIeeNG/status/172744319...   

                                                text              date  likes  \
0  A statement from psychoanalytic activists:  Th...  11/22/2023 21:47      0   
1                        bak bak bak bak doyamadınız  11/22/2023 15:27    443   
2  Check out 🏒 35 + different ERIK KARLSSON cards...    10/7/2023 7:15      0   
3  Il s'en passe des trucs pendant qu'on vous ori...  11/23/2023 11:12    381   
4  AW OKAY.. WELL THATS COOL, IM SURE PAL WILL AP...  11/22/2023 21:45      0   

   comments  
0         0  
1         9  
2         0  
3        44  
4         0  


In [138]:
#step 4 - extract usernames from the tweet link
#splitting "https://twitter.com/<username>/status/<id>" on "/" puts the username at index 3
def extract_username(link):
    """Return the username segment of a tweet link, or None if it cannot be found.

    Parameters:
        link: the tweet URL; any non-string value (e.g. NaN for a missing link)
            is treated as missing.

    Returns:
        The fourth "/"-separated part of the link (index 3), or None when the
        link is not a string or has fewer than four parts.
    """
    #explicit checks instead of a bare except, so unrelated errors are not silently hidden
    if not isinstance(link, str):
        return None #missing or non-text link
    parts = link.split("/")
    if len(parts) <= 3:
        return None #broken link without a username segment
    return parts[3] #this gets the username

#store the extracted username of every tweet in a new 'username' column
df['username'] = df['link'].map(extract_username)

In [139]:
#step 5 - parse the 'date' column into datetimes (needed for the timeline features)
#errors='coerce' is passed so that bad values do not raise
df['date'] = pd.to_datetime(arg=df['date'], errors='coerce')

#step 6 - remove rows in which every column is empty
df = df.dropna(how='all')

In [140]:
#step 7 - show basic info to see how many columns and rows are present
#df.info() prints its report itself and returns None, so it is not wrapped in print()
#(wrapping it would add a stray "None" line to the output)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15498 entries, 0 to 15497
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        15498 non-null  int64         
 1   link      15498 non-null  object        
 2   text      15470 non-null  object        
 3   date      15498 non-null  datetime64[ns]
 4   likes     15498 non-null  int64         
 5   comments  15498 non-null  int64         
 6   username  15498 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 847.7+ KB
None


In [141]:
#2. Feature Extraction

In [142]:
#step 1 - number of tweets posted by each user, as a two-column table (username, tweet_count)
tweet_count = (
    df['username']
    .value_counts()
    .rename_axis('username')
    .reset_index(name='tweet_count')
)

In [143]:
#step 2 - order tweets by user and then by date, and store the gap (in seconds)
#between each tweet and the same user's previous tweet
df_sorted = df.sort_values(by=['username', 'date']).assign(
    time_difference=lambda frame: frame.groupby('username')['date'].diff().dt.total_seconds()
)

In [144]:
#step 3 - calculate the average time between tweets for each user, in minutes
#users without any gap value get 0 via fillna(0)
average_time_between = (
    df_sorted
    .groupby('username')['time_difference']
    .mean()
    .fillna(0)
)
#time_difference is in seconds, so divide by 60 to get minutes
average_time_between_minutes = average_time_between.div(60).rename('average_time_between_tweets_minutes')

In [145]:
#step 4 - check whether more than 50% of the words in a tweet are written in all caps
def is_majority_all_caps(text):
    """Return True when more than half of the whitespace-separated words are upper case.

    Parameters:
        text: the tweet text; any non-string value (e.g. NaN for a missing
            tweet) is treated as not all caps.

    Returns:
        True if the share of words for which str.isupper() is true exceeds 0.5,
        otherwise False (also for empty or missing text).
    """
    if not isinstance(text, str):
        return False
    words = text.split()
    if not words:
        return False
    #isupper() must be called on each word; calling it on the list raised an
    #AttributeError that was swallowed, so every tweet was reported as False
    upper_case_words = sum(word.isupper() for word in words)
    return upper_case_words / len(words) > 0.5

#per user: mean of the per-tweet all-caps flags
percentage_all_caps = (
    df.groupby('username')['text']
    .apply(lambda user_tweets: np.mean([is_majority_all_caps(tweet) for tweet in user_tweets]))
    .rename('percentage_of_tweets_with_all_caps')
)

In [146]:
#step 5 - count average number of emojis in each tweet
def count_emojis(text):
    """Return the number of emojis in a tweet.

    Parameters:
        text: the tweet text; any non-string value (e.g. NaN for a missing
            tweet) counts as 0 emojis.

    Returns:
        The number of emojis found by emoji.emoji_count.
    """
    if not isinstance(text, str):
        return 0
    #emoji_count is used instead of testing each character against EMOJI_DATA,
    #because the per-character test miscounts emoji made of several code points
    #(e.g. flags, skin-tone variants, ZWJ sequences)
    return emoji.emoji_count(text)

#per user: mean emoji count over that user's tweets
average_emoji_count = (
    df.groupby('username')['text']
    .apply(lambda user_tweets: np.mean([count_emojis(tweet) for tweet in user_tweets]))
    .rename('average_emoji_count_per_tweet')
)

In [147]:
#step 6 - track emotionally intense language using a basic list of trigger words
emotional_words = ["attack", "hate", "furious", "angry", "kill", "rage", "disgust", "traitor", "destroy", "explode"]

def count_emotional_words(text):
    """Return how many words of a tweet appear in the emotional_words list.

    The text is lower-cased and split into word tokens before matching, so only
    whole words are counted. Non-string input (e.g. a missing tweet) gives 0.
    """
    if isinstance(text, str):
        tokens = re.findall(r'\b\w+\b', text.lower())
        return len([token for token in tokens if token in emotional_words])
    return 0

#per user: mean number of trigger words per tweet
average_emotion_words = (
    df.groupby('username')['text']
    .apply(lambda user_tweets: np.mean([count_emotional_words(tweet) for tweet in user_tweets]))
    .rename('average_emotionally_charged_words_per_tweet')
)

In [148]:
#step 7 - calculate tweet length average per user (in characters)
#a missing tweet (NaN) counts as length 0; str(NaN) would have counted it as the 3-character text "nan"
average_tweet_length = df.groupby('username')['text'].apply(
    lambda tweets: np.mean([len(t) if isinstance(t, str) else 0 for t in tweets])
).rename('average_tweet_length')

In [149]:
#step 8 - count average number of hashtags per tweet
def count_hashtags(text):
    """Return the number of hashtags ('#' followed by word characters) in a tweet.

    The input is converted with str() first, so non-string values are accepted.
    """
    as_text = str(text)
    return sum(1 for _ in re.finditer(r'#\w+', as_text))

#per user: mean hashtag count over that user's tweets
average_hashtags = (
    df.groupby('username')['text']
    .apply(lambda user_tweets: np.mean([count_hashtags(tweet) for tweet in user_tweets]))
    .rename('average_hashtag_frequency_per_tweet')
)

In [150]:
#step 9 - set tweet count index to match others
#only move 'username' into the index while it is still a column, so that
#re-running this cell does not fail with a KeyError
if 'username' in tweet_count.columns:
    tweet_count = tweet_count.set_index('username')

In [151]:
#step 10 - combine all features coded until now into one dataset
#the pieces are placed side by side as columns
feature_matrix = pd.concat(
    [
        tweet_count,
        average_time_between_minutes,
        percentage_all_caps,
        average_emoji_count,
        average_emotion_words,
        average_tweet_length,
        average_hashtags,
    ],
    axis='columns',
)

In [152]:
#step 11 - check the final result by printing the first 5 rows
print(feature_matrix.head(n=5))

                 tweet_count  average_time_between_tweets_minutes  \
username                                                            
Novytique                 84                            45.156627   
salusalemchalom           48                             5.851064   
Kuwait_KW01               38                           102.054054   
diamoundgirls2            37                            27.305556   
AvivaKlompas              31                           958.433333   

                 percentage_of_tweets_with_all_caps  \
username                                              
Novytique                                       0.0   
salusalemchalom                                 0.0   
Kuwait_KW01                                     0.0   
diamoundgirls2                                  0.0   
AvivaKlompas                                    0.0   

                 average_emoji_count_per_tweet  \
username                                         
Novytique                     