In [None]:
import math
import re
from collections import Counter
from functools import lru_cache

import nltk
import pandas as pd
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Download the NLTK resources the rest of the notebook relies on.
# tokenizer models (word_tokenize)
nltk.download('punkt')
# English word list used to drop non-English tokens
nltk.download('words')
# stop-word lists
nltk.download('stopwords')
# data for WordNetLemmatizer
nltk.download('wordnet')
# male/female first-name lists used for the gender lookup
nltk.download('names')
# lexicon for SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

In [None]:
# Read the raw export, then keep only the first row seen for each screen name.
df_with_duplicates = pd.read_csv('/content/sample_data/Twitter_data.csv')
df = df_with_duplicates.drop_duplicates(subset=['twitter_screen_name'])

# Report how many rows the de-duplication removed.
print(f"Length of df_with_duplicates: {len(df_with_duplicates)}")
print(f"Length of df: {len(df)}")

Length of df_with_duplicates: 29063
Length of df: 26585


# Functions

In [None]:
@lru_cache(maxsize=None)
def _english_vocab():
    """Return the lowercased NLTK English word list as a frozenset (built once, then cached)."""
    return frozenset(w.lower() for w in nltk.corpus.words.words())


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset (built once, then cached)."""
    return frozenset(stopwords.words('english'))


def filter_out_non_english_stop_lemma_words(text, lemmatize):
    """Tokenize `text` and keep only lowercased English, non-stop-word tokens.

    Args:
        text: String to tokenize with `word_tokenize`.
        lemmatize: When truthy, each surviving token is passed through
            `WordNetLemmatizer.lemmatize` before being returned.

    Returns:
        List of lowercased tokens, in their original order, that appear in the
        NLTK `words` corpus and are not English stop words.
    """
    # Both lexicons are cached: rebuilding the full English word set on every
    # call dominated the runtime of the callers that invoke this repeatedly.
    vocab = _english_vocab()
    stop_words = _english_stopwords()

    final_words = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in vocab and lowered not in stop_words:
            final_words.append(lowered)

    # lemmatization based on true/false flag
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        final_words = [lemmatizer.lemmatize(word) for word in final_words]

    return final_words


def calculate_word_frequencies(text, top_n=30):
    """Return the `top_n` most common cleaned, lemmatized words of `text`.

    The text is first reduced to English, non-stop-word lemmas; the result is
    a list of (word, count) pairs ordered from most to least frequent.
    """
    lemmas = filter_out_non_english_stop_lemma_words(text, True)
    return FreqDist(lemmas).most_common(top_n)


def identify_gender_by_first_name(first_name):
    """Classify a first name as 'Male', 'Female' or 'Unknown'.

    The name is lowercased and stripped of non-letters, then looked up in the
    module-level `male_names` / `female_names` sets. A name found in both
    sets, or in neither, is reported as 'Unknown'.
    """
    cleaned = re.sub(r'[^a-zA-Z]', '', first_name.lower())
    is_male = cleaned in male_names
    is_female = cleaned in female_names

    # Ambiguous (both lists) and unseen (neither list) names are not classified.
    if is_male == is_female:
        return 'Unknown'
    return 'Male' if is_male else 'Female'

def calculate_pmi(text, pr_class, total_words, corpus_docs=None, top_n=30):
    """Rank the words of one class by pointwise mutual information (PMI).

    For each cleaned word of `text` (English, non-stop-word, not lemmatized):

        PMI = log2( P(word, class) / (P(word) * P(class)) )

    with P(word, class) = count of the word in `text` / total_words and
    P(word) = count of the word as a whole token in `corpus_docs` / total_words.
    Both probabilities share the `total_words` denominator, so it cancels in
    the ratio and only has to be non-zero.

    Args:
        text: Concatenated descriptions of the class being scored.
        pr_class: Prior probability of the class.
        total_words: Shared denominator for the word probabilities.
        corpus_docs: Iterable of description strings forming the reference
            corpus (non-string entries such as NaN are skipped). When omitted,
            the notebook-level `new_df['twitter_desc']` is used, which keeps
            existing three-argument calls working.
        top_n: Number of highest-PMI words to return.

    Returns:
        List of (word, pmi) tuples sorted by PMI, highest first, at most
        `top_n` long.
    """
    if corpus_docs is None:
        # Backward-compatible default for callers that rely on the notebook global.
        corpus_docs = new_df['twitter_desc']

    words = filter_out_non_english_stop_lemma_words(text, False)

    # calc word frequencies inside the class
    word_freq = Counter(words)

    # Count corpus tokens once. Whole-token counts avoid substring hits
    # (e.g. 'art' inside 'artist') and replace a full corpus rescan per word.
    corpus_freq = Counter(
        token.lower()
        for doc in corpus_docs
        if isinstance(doc, str)
        for token in word_tokenize(doc)
    )

    # calc PMI for each word
    pmi_words = []
    for word, freq in word_freq.items():
        pr_word_class = freq / total_words
        pr_word = corpus_freq[word] / total_words

        # avoid zeroes before division
        if pr_word != 0:
            pmi = math.log2(pr_word_class / (pr_word * pr_class))
            pmi_words.append((word, pmi))

    # sort by PMI in desc order
    pmi_words.sort(key=lambda x: x[1], reverse=True)

    return pmi_words[:top_n]

def calculate_pmi_for_type(df, type_column, type_value):
    """Compute top PMI words for rows of one type and for all remaining rows.

    Args:
        df: DataFrame with a 'twitter_desc' column and the column named by
            `type_column`.
        type_column: Name of the column holding the type labels.
        type_value: Label that selects the "type" group; every other row
            forms the "non-type" group.

    Returns:
        Tuple `(type_pmi_words, non_type_pmi_words)`, each a list of
        (word, pmi) tuples as returned by `calculate_pmi`.
    """
    type_df = df[df[type_column] == type_value]
    non_type_df = df[df[type_column] != type_value]

    # concatenate the Twitter descriptions for each group
    type_desc = ' '.join(type_df['twitter_desc'].dropna())
    non_type_desc = ' '.join(non_type_df['twitter_desc'].dropna())

    # calculate total words for PMI calculation
    total_type_words = len(word_tokenize(type_desc))
    total_non_type_words = len(word_tokenize(non_type_desc))

    # Word probabilities must come from the frame being analysed (both groups
    # together), not from an unrelated notebook-level frame.
    corpus_docs = df['twitter_desc'].dropna()

    # calculate PMI for type users
    pr_type = type_df.shape[0] / df.shape[0]
    type_pmi_words = calculate_pmi(type_desc, pr_type, total_type_words, corpus_docs=corpus_docs)

    # calculate PMI for non-type users
    pr_non_type = non_type_df.shape[0] / df.shape[0]
    non_type_pmi_words = calculate_pmi(non_type_desc, pr_non_type, total_non_type_words, corpus_docs=corpus_docs)

    return type_pmi_words, non_type_pmi_words




# 1. Top 30 most frequent words in 'twitter_desc' column

In [None]:
# Pool every non-missing description into a single string and rank its words.
twitter_desc = ' '.join(df['twitter_desc'].dropna())
top_words = calculate_word_frequencies(twitter_desc)

print("Top 30 most frequent words in 'twitter_desc' column:")
for pair in top_words:
    word, frequency = pair
    print(f"{word}: {frequency}")

Top 30 most frequent words in 'twitter_desc' column:
official: 2741
twitter: 2344
news: 1784
new: 1423
world: 1384
account: 1343
author: 1143
u: 966
de: 826
follow: 731
former: 669
actor: 661
music: 649
time: 595
writer: 570
life: 554
host: 552
la: 530
love: 529
sport: 516
page: 511
people: 506
contact: 488
father: 488
husband: 486
since: 471
business: 464
album: 457
champion: 452
best: 448


# 2. Words associated with gender

In [None]:
# female, male name lexicons (lowercased so lookups are case-insensitive)
names = nltk.corpus.names
male_names = set(name.lower() for name in names.words('male.txt'))
female_names = set(name.lower() for name in names.words('female.txt'))


def _gender_from_display_name(display_name):
    """Infer gender from the first whitespace-separated token of a display name.

    Missing, empty or whitespace-only names yield 'Unknown' instead of raising.
    """
    if pd.isnull(display_name):
        return 'Unknown'
    parts = str(display_name).split()
    if not parts:
        return 'Unknown'
    return identify_gender_by_first_name(parts[0])


# copy of the df for Person type (filter out org, work and so on)
df_person = df[df['type2'] == 'Person'].copy()
df_person['gender'] = df_person['twitter_name'].apply(_gender_from_display_name)

# create a new df with selected columns, remove NaN values from 'twitter_desc'
new_df = df_person[['twitter_name', 'gender', 'twitter_desc']].copy()
# Assign the filled column back: in-place fillna on a selected column is a
# chained assignment and does not reliably update the frame.
new_df['twitter_desc'] = new_df['twitter_desc'].fillna('')
new_df = new_df[new_df['gender'] != 'Unknown']

#new_df.to_csv('/content/sample_data/Twitter_data_with_gender.csv', index=False)

male_count = new_df[new_df['gender'] == 'Male'].shape[0]
female_count = new_df[new_df['gender'] == 'Female'].shape[0]

print(f"Number of Male users: {male_count}")
print(f"Number of Female users: {female_count}")

word_freq_by_gender = new_df.groupby('gender')['twitter_desc'].apply(lambda x: calculate_word_frequencies(' '.join(x)))

# results (a dedicated loop name keeps the earlier `top_words` result intact)
for gender, gender_top_words in word_freq_by_gender.items():
    print(f"\nTop 30 most frequent words for {gender} users:")
    for word, frequency in gender_top_words:
        print(f"{word}: {frequency}")

Number of Male users: 5895
Number of Female users: 3356

Top 30 most frequent words for Female users:
author: 322
actress: 248
new: 195
writer: 159
wife: 142
official: 137
host: 126
world: 126
former: 123
lover: 119
twitter: 119
actor: 117
mother: 112
time: 112
de: 102
love: 99
book: 94
life: 90
founder: 83
la: 81
director: 80
singer: 79
girl: 77
news: 75
producer: 72
champion: 70
speaker: 66
contact: 61
correspondent: 61
music: 59

Top 30 most frequent words for Male users:
author: 450
official: 400
twitter: 349
new: 346
husband: 293
father: 286
actor: 278
former: 278
host: 210
de: 197
writer: 196
world: 191
time: 177
account: 166
player: 159
champion: 157
director: 152
dad: 143
founder: 142
life: 131
music: 126
book: 126
love: 122
producer: 115
professional: 114
page: 110
editor: 108
la: 104
contact: 100
business: 96


Results:
- 'author' is the most frequent word in both groups, which might be influenced by self-promotion, professional careers, and other factors
- in both groups words associated with personal and familial roles are at the top of the list (presence of societal biases and gender stereotypes)
- both groups also mention words related to professional activities, like writing, producing, or engaging with media



# Words associated with gender using PMI

In [None]:
# Token count over every gender-labelled description; shared PMI denominator.
total_words = len(word_tokenize(' '.join(new_df['twitter_desc'])))
labelled_users = male_count + female_count

# calculate PMI for female users
female_desc = ' '.join(new_df.loc[new_df['gender'] == 'Female', 'twitter_desc'])
pr_female = female_count / labelled_users
female_pmi_words = calculate_pmi(female_desc, pr_female, total_words)

# calculate PMI for male users
male_desc = ' '.join(new_df.loc[new_df['gender'] == 'Male', 'twitter_desc'])
pr_male = male_count / labelled_users
male_pmi_words = calculate_pmi(male_desc, pr_male, total_words)

print("\nTop 30 most frequent words with highest PMI for Female users:")
for word, pmi in female_pmi_words:
    print(f"{word}: {pmi}")

print("\nTop 30 most frequent words with highest PMI for Male users:")
for word, pmi in male_pmi_words:
    print(f"{word}: {pmi}")



Top 30 most frequent words with highest PMI for Female users:
blah: 2.7847947033399807
geordie: 2.462866608452618
mutton: 2.462866608452618
usun: 2.462866608452618
winehouse: 2.462866608452618
flea: 2.462866608452618
optimism: 2.462866608452618
impressionist: 2.462866608452618
lunar: 2.462866608452618
tongue: 2.462866608452618
shetland: 2.462866608452618
patsy: 2.462866608452618
disability: 2.462866608452618
bittersweet: 2.0478291091737746
latex: 2.0478291091737746
stardom: 2.0478291091737746
decision: 2.0478291091737746
algorithmic: 2.0478291091737746
ballet: 1.7847947033399805
meditation: 1.7847947033399805
administration: 1.7259010142864122
eyewitness: 1.4628666084526183
trucker: 1.462866608452618
spectacular: 1.462866608452618
beautifully: 1.462866608452618
pursuer: 1.462866608452618
melanin: 1.462866608452618
weirdness: 1.462866608452618
unprofessionally: 1.462866608452618
eyebrow: 1.462866608452618

Top 30 most frequent words with highest PMI for Male users:
wheeled: 2.235080011

Results:

- we got totally different results with PMI than with the plain frequency count in the previous section. If a rare word occurs in the context of a specific class (for example 'blah' among female users and 'wheeled' among male users), and this co-occurrence is much higher than expected by chance, the PMI score for that word in that class will be high.
- for the female users, some of the words with high PMI values include 'blah', 'geordie', 'mutton', 'winehouse', 'optimism', 'impressionist', and 'lunar'.
- similarly, for the male users, words like 'wheeled', 'honk', 'bermuda', 'batch', 'luigi', and 'canty' have high PMI values, strongly associated with male users.
- these results indicate the presence of certain language patterns or topics that are more prevalent or distinctive among male and female users based on our dataset.

# 3. MusicalArtist Politician most frequent PMI associated words

In [None]:
# Score words for MusicalArtist accounts against all other accounts.
musical_artist_pmi_words, non_musical_artist_pmi_words = calculate_pmi_for_type(
    df, type_column='type1', type_value='MusicalArtist'
)

print("\nTop 30 most frequent words with highest PMI for MusicalArtist users:")
for word, pmi in musical_artist_pmi_words:
    print(f"{word}: {pmi}")

print("\nTop 30 most frequent words with highest PMI for Non-MusicalArtist users:")
for word, pmi in non_musical_artist_pmi_words:
    print(f"{word}: {pmi}")


Top 30 most frequent words with highest PMI for MusicalArtist users:
electronic: 5.276680751921932
melody: 5.276680751921932
nuclear: 4.954752657034569
blast: 4.954752657034569
donnie: 4.954752657034569
anniversary: 4.762107579092173
royalty: 4.539715157755725
bluegrass: 4.539715157755725
sonic: 4.539715157755725
vibrant: 4.539715157755725
vinyl: 4.276680751921932
dove: 4.276680751921932
stellar: 4.276680751921932
acoustic: 4.177145078371017
debut: 3.954752657034569
junior: 3.954752657034569
daydreamer: 3.954752657034569
label: 3.954752657034569
outrageous: 3.954752657034569
jorge: 3.954752657034569
honk: 3.954752657034569
slang: 3.954752657034569
steady: 3.954752657034569
empire: 3.954752657034569
trivium: 3.954752657034569
revolt: 3.954752657034569
flamboyant: 3.954752657034569
hairline: 3.954752657034569
veil: 3.954752657034569
winehouse: 3.954752657034569

Top 30 most frequent words with highest PMI for Non-MusicalArtist users:
affiliate: 6.967410729518251
privacy: 5.5211544996286

Results:
- the results are logical based on the context of the user groups being analyzed
- for MusicalArtist users words like "electronic", "melody", "acoustic", "royalty" are highly relevant to the music industry, indicating a strong association with MusicalArtist users
- for Non-MusicalArtist users words such as "affiliate", "privacy", "analysis", "customer" are more relevant to business, technology, and organizational topics.
- overall results align with the expected differences in the topics and interests between MusicalArtist and Non-MusicalArtist users on Twitter

In [None]:
# Score words for Politician accounts against all other accounts.
politician_pmi_words, non_politician_pmi_words = calculate_pmi_for_type(
    df, type_column='type1', type_value='Politician'
)

print("\nTop 30 most frequent words with highest PMI for Politician users:")
for word, pmi in politician_pmi_words:
    print(f"{word}: {pmi}")

print("\nTop 30 most frequent words with highest PMI for Non-Politician users:")
for word, pmi in non_politician_pmi_words:
    print(f"{word}: {pmi}")


Top 30 most frequent words with highest PMI for Politician users:
pueblo: 6.5159304925656745
hampshire: 6.5159304925656745
congressional: 6.265952239557327
administration: 6.194002397678312
constituency: 6.1008929932868305
opposition: 6.1008929932868305
estado: 6.1008929932868305
agriculture: 6.1008929932868305
assistance: 6.1008929932868305
hiker: 6.1008929932868305
whip: 5.930967991844518
district: 5.8944421158194045
serving: 5.882712823237298
proudly: 5.869567447180375
vermont: 5.837858587453037
subcommittee: 5.778964898399468
minister: 5.76831765419996
pennsylvania: 5.738322913902122
senator: 5.72614819995602
congressman: 5.6154661661165886
governor: 5.531527347616693
turkey: 5.5159304925656745
secretary: 5.5159304925656745
counselor: 5.5159304925656745
infrastructure: 5.5159304925656745
islamic: 5.5159304925656745
kansan: 5.5159304925656745
chairwoman: 5.5159304925656745
usun: 5.5159304925656745
tammy: 5.5159304925656745

Top 30 most frequent words with highest PMI for Non-Politi

Results
- the results seem reasonable, since the top words are often related to political contexts. Words like "pueblo," "hampshire," "congressional," "administration," and "constituency" got high PMI values, indicating a strong association with the Politician users in the dataset.

# 4. Sentiment analysis

In [None]:
# Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Work on a copy so the sentiment column does not leak into `df`.
df_copy = df.copy()
# replace NaN values in twitter_desc; assign the result back, because in-place
# fillna on a selected column is a chained assignment and may not update the frame
df_copy['twitter_desc'] = df_copy['twitter_desc'].fillna('')

# calculate sentiment scores for each twitter_desc (VADER compound score)
df_copy['sentiment_score'] = df_copy['twitter_desc'].apply(lambda x: sia.polarity_scores(x)['compound'])

top_positive_users = df_copy.nlargest(10, 'sentiment_score')[['twitter_screen_name', 'sentiment_score', 'twitter_desc']]
print("\nTop 10 Most Positive Users:")
print(top_positive_users)

top_negative_users = df_copy.nsmallest(10, 'sentiment_score')[['twitter_screen_name', 'sentiment_score', 'twitter_desc']]
print("\nTop 10 Most Negative Users:")
print(top_negative_users)


Top 10 Most Positive Users:
      twitter_screen_name  sentiment_score  \
18496     JasonCrabbMusic           0.9859   
6986       IamWendyRaquel           0.9800   
18250         mousasi_mma           0.9785   
14603        LatitudeFest           0.9778   
7160       malillanymarin           0.9734   
21818           RLLracing           0.9725   
14255     StarburyMarbury           0.9718   
17410            petatodd           0.9716   
13004            streamys           0.9704   
9493            MooreMaya           0.9698   

                                            twitter_desc  
18496  HUSBAND. FATHER. CHILD OF GOD. LOVE PEOPLE. LO...  
6986   Lover of Life, Love and Laughter! Check out my...  
18250  *BELLATOR MW CHAMPION 2X\n*STRIKEFORCE LHW CHA...  
14603  Winner of the UK Festival Awards 'Best Major F...  
7160   http://Actress . TV host & model . Animal Righ...  
21818  1992 #IndyCar champion, 2004 #Indy500 winner f...  
14255  JESUS IS REAL! God Is Love, Love Is God.... 

Results
- the results mostly appear to be reasonable based on the sentiment scores calculated using vader_lexicon
- top negative description really looks negative, example: GraigKreindler	Graig Kreindler	Graig paints dead baseball players. Though not OF them dead. Not that there's anything wrong with the dead. In fact, Graig loves the Dead.
- top positive description looks positive, example: JasonCrabbMusic	Jason Crabb	HUSBAND. FATHER. CHILD OF GOD. LOVE PEOPLE. LOVE JESUS. LOVE IS STRONGER! @TheGRAMMYs award winner. :-)
- sentiment analysis might struggle with nuanced language due to the lack of context. For example, the user in 4th place among the negative descriptions identifies himself as a 'life-lover,' while terms like 'anti-death penalty activist' and 'spiritual adviser to men and women on death row' suggest involvement with heavy and potentially negative topics. These complex sentiments are not fully captured by the algorithm, which places the user at the top of the negative sentiments.


# 5. Hebrew, arabic characters in twitter_desc

In [None]:
def percentage_of_words_in_script(text, script_regex):
    """Return the percentage of tokens in `text` that match `script_regex`.

    Args:
        text: A string, or an iterable of strings (for example a pandas
            Series of descriptions). For an iterable, the string items are
            joined with spaces and non-string items (NaN/None) are skipped.
            Any other value is converted with `str()`.
        script_regex: Regular expression searched for in every token.

    Returns:
        Float in [0, 100]; 0.0 when the input contains no tokens.
    """
    if isinstance(text, str):
        corpus = text
    elif hasattr(text, '__iter__'):
        # str() of a Series is its (possibly truncated) display text, not the
        # data, so join the actual items instead.
        corpus = ' '.join(item for item in text if isinstance(item, str))
    else:
        corpus = str(text)

    # tokenize the text into words
    words = nltk.word_tokenize(corpus)
    if not words:
        # nothing to measure; avoid dividing by zero
        return 0.0

    # count the words containing characters in the specified script
    pattern = re.compile(script_regex)
    script_words_count = sum(1 for word in words if pattern.search(word))

    # calculate the percentage
    percentage = (script_words_count / len(words)) * 100

    return percentage

# Join the non-missing descriptions into one string first: handing the Series
# itself to the helper makes it tokenize the Series' display text, not the data.
all_desc_text = ' '.join(df['twitter_desc'].dropna().str.lower())

# calculate the percentage of words containing Hebrew characters
hebrew_percentage = percentage_of_words_in_script(all_desc_text, '[\u0590-\u05FF]')
print(f"The percentage of words containing Hebrew characters: {hebrew_percentage:.5f}%")

# calculate the percentage of words containing Arabic characters
arabic_percentage = percentage_of_words_in_script(all_desc_text, '[\u0600-\u06FF]')
print(f"The percentage of words containing Arabic characters: {arabic_percentage:.5f}%")

The percentage of words containing Hebrew characters: 0.00000%
The percentage of words containing Arabic characters: 0.00000%


# 6. Emoticons

In [None]:
def extract_emoticons(text):
    """Return every emoticon found in `text`, in order of appearance.

    An emoticon here is eyes (`:`, `;` or `=`), an optional `-` nose, and a
    mouth (`)`, `(`, `D` or `P`). Non-string input such as NaN or None yields
    an empty list instead of raising.
    """
    if not isinstance(text, str):
        return []
    emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'
    return re.findall(emoticon_pattern, text)

# Extract emoticons from 'twitter_desc' column and flatten the list.
# explode() turns each empty result list into NaN, so drop those afterwards;
# otherwise "nan" is counted as the most frequent emoticon.
all_emoticons = (
    df['twitter_desc']
    .dropna()
    .apply(extract_emoticons)
    .explode()
    .dropna()
)

# Count the occurrences of each emoticon
emoticon_counts = Counter(all_emoticons)

# Print the top 10 most frequent emoticons
top_emoticons = emoticon_counts.most_common(10)
print("Top 10 Most Frequent Emoticons:")
for emoticon, count in top_emoticons:
    print(f"{emoticon}: {count}")

Top 10 Most Frequent Emoticons:
nan: 26490
:): 57
;): 11
:D: 9
:-): 7
:P: 7
;-): 4
:(: 2
;P: 1
