In [214]:
import pandas as pd

##### Reading Data

In [215]:
data = pd.read_csv('Twitter_data.csv')
# data.sort_values("twitter_screen_name", inplace = True)
data.drop_duplicates(subset ="twitter_screen_name", keep = "first", inplace = True)
data

Unnamed: 0,twitter_screen_name,twitter_name,twitter_desc,twitter_location,wikidata_desc,type1,type2
0,BarackObama,Barack Obama,"Dad, husband, President, citizen.","Washington, DC",44th president of the United States,Politician,Person
1,justinbieber,Justin Bieber,#Changes out now,,Canadian singer,MusicalArtist,Person
2,katyperry,KATY PERRY,Love. Light.,,American singer,MusicalArtist,Person
3,rihanna,Rihanna,,,"Barbadian singer, songwriter, and businesswoman",MusicalArtist,Person
4,taylorswift13,Taylor Swift,Lover out now,,American singer-songwriter,MusicalArtist,Person
...,...,...,...,...,...,...,...
29058,JessalynGilsig,Jessalyn Gilsig,Big Shot Vikings Glee Scandal Nip Tuck and any...,,actress,Person,Person
29059,ReemaMajor,Reema Major,Dey Say Only Da Good Die Young So Fuck It ima ...,South Sudan - UAE,Canadian rapper,MusicalArtist,Person
29060,EqualityTexas,Equality Texas,We work to secure full equality for LGBTQ Texa...,"Austin, Texas",,Organisation,Organisation
29061,Margus_Hunt,Margus Hunt,Official Twitter page of Margus Hunt,"Indianapolis, IN","Estonian track and field athlete, American foo...",AmericanFootballPlayer,Person


##### Most Frequent  Words in twitter_desc

In [216]:
import nltk

In [218]:
twitter_desc = data[data['twitter_desc'].notna()]['twitter_desc'].tolist()
twitter_desc = ' '.join(twitter_desc) # convert list to string 

###### data cleansing

In [219]:
# imports
english_vocab = sorted(set(w.lower() for w in nltk.corpus.words.words()))
stopwords = nltk.corpus.stopwords.words('english')
def cleanse_data(s):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # tokenize the input string 
    s = list(nltk.tokenize.word_tokenize(s))
    res = []
#     iterate over words in string 
    for word in s:
        if word.lower() in english_vocab and word.lower() not in stopwords: # if words exists in english_ vocab and doesnt exist in stopwords
            res.append(lemmatizer.lemmatize(word.lower()))                  # perform lemmatization, and then add it to the final result list.  
    return res

30 Most Common Words in twitter_desc column

In [548]:
s = cleanse_data(twitter_desc)
pd.DataFrame(FreqDist(s).most_common(30), columns = ['word', 'frequency']).to_csv('question1.csv')
pd.DataFrame(FreqDist(s).most_common(30), columns = ['word', 'frequency'])

Unnamed: 0,word,frequency
0,official,2741
1,twitter,2344
2,news,1784
3,new,1423
4,world,1384
5,account,1343
6,author,1143
7,u,966
8,de,826
9,follow,731


# Male, Female Counts

In [480]:
names = nltk.corpus.names
names.fileids()
male_names = names.words('male.txt')
female_names = names.words('female.txt')
male_names = [w.lower() for w in male_names]
female_names = [w.lower() for w in female_names]

In [500]:
full_names = data[data['twitter_name'].notna()]['twitter_name'].tolist()
count_male, count_female = 0, 0
for name in full_names:
    first_name = name.split(' ')[0].lower()
    if first_name in male_names and first_name not in female_names:
        count_male += 1
    elif first_name in female_names and first_name not in male_names: 
        count_female += 1

In [501]:
{'Female count' : count_female
, 'Male Count' : count_male}

{'Female count': 3533, 'Male Count': 6334}

In [502]:
df_data = data[data['twitter_name'].notna() & data['twitter_desc'].notna()]

words_male = []
words_female = []

for index, row in df_data.iterrows():
    
    name = row['twitter_name'].split(' ')[0]
    if name in male_names and name not in female_names:
        words_male.extend(cleanse_data(row['twitter_desc']))
    elif name in female_names and name not in male_names: 
        words_female.extend(cleanse_data(row['twitter_desc']))

#### Top 30 Words describe Male:

In [544]:
pd.DataFrame(FreqDist(words_male).most_common(30), columns =['word', 'frequency'])

Unnamed: 0,word,frequency
0,actor,13
1,author,12
2,twitter,11
3,new,11
4,de,10
5,official,9
6,producer,9
7,host,8
8,like,7
9,world,7


#### Top 30 Words describe Female:

In [543]:
pd.DataFrame(FreqDist(words_female).most_common(30), columns =['word', 'frequency']).to_csv('women.csv')
pd.DataFrame(FreqDist(words_female).most_common(30), columns =['word', 'frequency'])

Unnamed: 0,word,frequency
0,actress,17
1,writer,13
2,author,12
3,world,11
4,actor,11
5,time,9
6,host,8
7,love,8
8,producer,8
9,official,7


We convert the freqDist to a dictionary so we can get the frequency of each word 

In [506]:
words_male = dict(FreqDist(words_male))
words_female = dict(FreqDist(words_female))

##### PMI

In [507]:
from  math import log2

def Pr_wClass(word,class_words): #class is by default both, else male or female
    class_len = sum(class_words.values())# we calculate the sum of frequencies for each class
    if word not in class_words:
        
        return 0
## if word is in class_1 words we add it's frequency so we can calculate it's proprortion next
# calculating the proportion of the word in a class (how many times it occured/the whole sum of frequencies of the class).
    return class_words[word] / class_len 

# what is the proportion between the class and sum of the classes.
def pr_class(class_):
    if class_ == 'male':
        return count_male/(count_male+count_female)
    elif class_ == 'female':
        return count_female/(count_male+count_female)
    elif class_ == 'musical_artist':
        return count_musical_artists / (count_musical_artists + count_non_musical_artists)
    elif class_ == 'non_musical_artists': 
        return count_non_musical_artists / (count_musical_artists + count_non_musical_artists)
    elif class_ == 'politicians':
        return count_politicians / (count_politicians + count_non_politicians)
    return count_non_politicians / (count_politicians + count_non_politicians)



def pr_w(word, class1_words, class2_words):
    res = 0 
    if word in class1_words: # if word in words of male add to it's frequency to result 
        res += class1_words[word]
    if word in class2_words: # if word in words of female add it's frequency to result.
        res += class2_words[word]
    return res/(sum(class1_words.values()) + sum(class2_words.values()))


# how much a word describe a class. 
def PMI(word , class1_words,class2_words,class_):
    if class_ == 'male' or class_ =='musical_artist' or class_ == 'politicians':
        score = Pr_wClass(word,class1_words)/(pr_w(word,class1_words,class2_words)*pr_class(class_))
        return  log2(score) if score > 0 else 0
    score =  Pr_wClass(word,class2_words)/(pr_w(word,class1_words,class2_words)*pr_class(class_))
    return log2(score) if score > 0 else 0

calculating pmi for male and female words

In [514]:
male_pmi, female_pmi = [],[] # empty lists to store the pmi calculations

for word in words_male.keys(): # iterate over all the words and calculate pmi
    male_pmi.append((word, PMI(word,words_male,words_female  ,'male'))) # we insert to the male_pmi list a tuple with two values: (word, pmi_score)
# the same operation as above.
for word in words_female.keys():
    female_pmi.append((word,PMI(word,words_male,words_female ,'female')))


###### Words that best describe Male:

In [546]:
# we sort the list of tuples by the 
pd.DataFrame(sorted(male_pmi,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'PMI']).to_csv('male_pmi.csv')
pd.DataFrame(sorted(male_pmi,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'PMI'])

Unnamed: 0,word,PMI
0,astrophysicist,1.667551
1,build,1.667551
2,believe,1.667551
3,future,1.667551
4,pursue,1.667551
5,happiness,1.667551
6,turned,1.667551
7,productor,1.667551
8,conductor,1.667551
9,presidente,1.667551


###### Words that best describe female

In [547]:
pd.DataFrame(sorted(female_pmi,key = lambda x: x[1], reverse = True)[:30], columns = ['word', PMI]).to_csv('female_pmi.csv')
pd.DataFrame(sorted(female_pmi,key = lambda x: x[1], reverse = True)[:30], columns = ['word', PMI])

Unnamed: 0,word,<function PMI at 0x0000027A3941B9D0>
0,queen,2.454197
1,part,2.454197
2,solo,2.454197
3,armor,2.454197
4,cook,2.454197
5,ray,2.454197
6,feed,2.454197
7,water,2.454197
8,melanin,2.454197
9,blood,2.454197


# Conclusions 

-  We can clearly notice that the words that describe each class are different when we use PMI score instead of the count of occurences.
- The words that describe each class changed.
- all top words have the same pmi score, although it's unlikely for them to have the same frequency. 
- We can use these words for further use, e.g., we could use them to predict unseen user's class by looking at the words mentioned in twitter_desc.
- values in the first section of are the frequency of each word, while using pmi score we are looking at each word and given it a score on how much it can be affiliated to the class. then, we can use this to know a twitter_desc is 

# What Best Describes By PMI Score

count the values of each class

In [517]:
count_musical_artists =  len(data[data['type1'] == 'MusicalArtist'])
count_non_musical_artists = len(data[(data['type1'] != 'MusicalArtist') & (data['type1'].notna())]) 

In [518]:
count_politicians =  len(data[data['type1'] == 'Politician'])
count_non_politicians = len(data[(data['type1'] != 'Politician') & (data['type1'].notna())]) 

###### Words that best describe MusicalArtists

getting words of musicalArtists

In [519]:
## getting the words that describe music artists from the data that have twitter_desc column not null.
music_artists_words = data[(data['type1'] == 'MusicalArtist') & (data['twitter_desc'].notna())]['twitter_desc'].tolist()
## we got the words as list of strings, let's convert it to a single string so we can then apply cleanse_data on a single string,.
music_artists_words = ' '.join(music_artists_words)
# let's get the words that describe the musical artists as a list by applying cleanse_data
music_artists_words = cleanse_data(music_artists_words)
# music_artists_words

getting words of non-musicalArtists

In [520]:
## getting the words that describe music artists from the data that have twitter_desc column not null.
not_musicart_words = data[(data['type1'] != 'MusicalArtist') & (data['twitter_desc'].notna())]['twitter_desc'].tolist()
## we got the words as list of strings, let's convert it to a single string so we can then apply cleanse_data on a single string,.
not_musicart_words = ' '.join(not_musicart_words)
# let's get the words that describe the musical artists as a list by applying cleanse_data
not_musicart_words = cleanse_data(not_musicart_words)

Best Words That Describe MusicalArtists

In [535]:
music_artists_words = dict(FreqDist(music_artists_words))
non_music_artists_words = dict(FreqDist(not_musicart_words))

pmi_music_artists, pmi_non_music_artists = [],[]

for word in music_artists_words.keys():
    pmi_music_artists.append((word, PMI(word,music_artists_words,non_music_artists_words,  'musical_artist'))) 
    
for word in non_music_artists_words.keys():
    pmi_non_music_artists.append((word,PMI(word,music_artists_words,non_music_artists_words, 'non_musical_artists')))

Top 30 Words Of MusicArtists

In [549]:
pd.DataFrame(sorted(pmi_music_artists,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'pmi']).to_csv('topMusic.csv')
pd.DataFrame(sorted(pmi_music_artists,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'pmi'])

Unnamed: 0,word,pmi
0,owe,6.415404
1,indigo,6.415404
2,playback,6.415404
3,relation,6.415404
4,dura,6.415404
5,melanin,6.415404
6,opus,6.415404
7,contradict,6.415404
8,thalia,6.415404
9,harass,6.415404


Top 30 Words of non MusicArtists

In [550]:
pd.DataFrame(sorted(pmi_non_music_artists,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'pmi']).to_csv('topNonMusic.csv')
pd.DataFrame(sorted(pmi_non_music_artists,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'pmi'])

Unnamed: 0,word,pmi
0,privacy,0.336608
1,trucker,0.336608
2,spectacular,0.336608
3,cotton,0.336608
4,restock,0.336608
5,program,0.336608
6,astrophysicist,0.336608
7,foundation,0.336608
8,difficult,0.336608
9,kevin,0.336608


# Conclusions of Music Artists

#### Politicians

In [524]:
## getting the words that describe music artists from the data that have twitter_desc column not null.
politicians_words = data[(data['type1'] == 'Politician') & (data['twitter_desc'].notna())]['twitter_desc'].tolist()
## we got the words as list of strings, let's convert it to a single string so we can then apply cleanse_data on a single string,.
politicians_words = ' '.join(politicians_words)
# let's get the words that describe the musical artists as a list by applying cleanse_data
politicians_words = cleanse_data(politicians_words)


getting words of non-politicians

In [525]:
## getting the words that describe music artists from the data that have twitter_desc column not null.
not_politicians_words = data[(data['type1'] != 'Politician') & (data['twitter_desc'].notna())]['twitter_desc'].tolist()
## we got the words as list of strings, let's convert it to a single string so we can then apply cleanse_data on a single string,.
not_politicians_words = ' '.join(not_politicians_words)
# let's get the words that describe the musical artists as a list by applying cleanse_data
not_politicians_words = cleanse_data(not_politicians_words)

Best Words That Describe Politicians

In [531]:
politicians_words = dict(FreqDist(politicians_words))
not_politicians_words = dict(FreqDist(not_politicians_words))

pmi_politicians, pmi_non_politicians = [],[]

for word in politicians_words.keys():
    pmi_politicians.append((word, PMI(word,politicians_words,not_politicians_words,  'politicians'))) 
    
for word in not_politians_words.keys():
    pmi_non_politicians.append((word,PMI(word,politicians_words,not_politicians_words, 'non_politicians')))

Top 30 Words Of Politicians

In [551]:
pd.DataFrame(sorted(pmi_politicians,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'PMI']).to_csv('politicians.csv')
pd.DataFrame(sorted(pmi_politicians,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'PMI'])

Unnamed: 0,word,PMI
0,senator,8.703561
1,congresswoman,8.703561
2,monument,8.703561
3,rahul,8.703561
4,wealthy,8.703561
5,lilian,8.703561
6,mari,8.703561
7,amelia,8.703561
8,soldado,8.703561
9,vasundhara,8.703561


Top 30 Words of non Politicians

In [552]:
pd.DataFrame(sorted(pmi_non_politicians,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'PMI']).to_csv('noPoliticians.csv')
pd.DataFrame(sorted(pmi_non_politicians,key = lambda x: x[1], reverse = True)[:30], columns = ['word', 'PMI'])

Unnamed: 0,word,PMI
0,light,0.145906
1,collection,0.145906
2,part,0.145906
3,net,0.145906
4,rare,0.145906
5,custom,0.145906
6,astrophysicist,0.145906
7,learning,0.145906
8,hart,0.145906
9,everybody,0.145906


# Conclusions of Politicians

# Sentiment Score - twitter_desc

Downloading opinion lexicon

In [111]:
# nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\mutla\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\opinion_lexicon.zip.


True

In [113]:
from nltk.corpus import opinion_lexicon

pos_list=set(opinion_lexicon.positive())
neg_list=set(opinion_lexicon.negative())


cleaning words of the two lists, removing things such '-' , ',' ...

In [136]:

positive, negative = [], []
for i in pos_list:
    positive.extend(cleanse_data(i))

for i in neg_list:
    negative.extend(cleanse_data(i))

Sentiment Score: (count of pos_words - count of neg_words)/count of words
- pos_words: positive words in description
- neg_words: negative words in description
- count of words: total count of words in description

Sentiment Score Calculation:

In [200]:
def sentiment_score(string):
    no = ['∼', '°', '″', '×', '…', '!', 'χ', '+', '‘','’',
        '—', '~', '>', '|', '<', '•', '→', '$', '\uf08e', '⋆', '´','•', '¨', 'ˆ', '′', '∑', '‚', '»',
        '«', '²', '€', '≥', '„', '›', '‹', '−', '¼', '£', 'ð', 'Þ', '\uf0b7', '\uf6bc', '∗', '†',
        '◦', '·', '±', '≤', '∏', 'º', '`', '�', '‡', '§', '̸', '⃗', '≪', 'ϕ', 'θ', '®', '≈', 'â', 
        'Ă', 'Ş', '∅', '™', '‐', '\x00', '（', '∕', '）', '𝑚', '\\', '\uffff','&', '–','?','#', '%',
        '=', '∀', '∈', '{', '}', '“', '”', '_', '*', '©',']','``', '’' '"', '[', ':', ';','/', '(', ')',
        "'",'-','.','@',',', '\r', '\n']
    string = [w for w in nltk.tokenize.word_tokenize(string) if w not in no]
    pos_words, neg_words,count_words = 0,0, len(string)
    if count_words ==0:
        return 0
    for word in string:
        if word in positive:
            pos_words += 1
        elif word in negative:
            neg_words += 1
    return (pos_words - neg_words)/count_words

In [209]:
descriptions = data[data['twitter_desc'].notna()]['twitter_desc'].tolist()

scores = []
for desc in descriptions:
    scores.append((desc, sentiment_score(desc)))


Top 10 positive Descriptions:

In [208]:
pd.DataFrame(sorted(scores,key = lambda x: x[1], reverse = True)[:10], columns =['Description', 'Score'])

Unnamed: 0,Description,Score
0,work,1.0
1,whoa,1.0
2,whoa,1.0
3,autonomous,1.0
4,patience,1.0
5,very stable genius,0.666667
6,charm type,0.5
7,Raw polished,0.5
8,music enthusiast,0.5
9,lead @GenesisNovels,0.5


Top 10 Negative Descriptions

In [210]:
pd.DataFrame(sorted(scores,key = lambda x: x[1], reverse = True)[-10:], columns =['Description',  'Score'])

Unnamed: 0,Description,Score
0,so bomb,-0.5
1,true untrue untrue ...,-0.5
2,crisis actress,-0.5
3,jaded. Celtic,-0.5
4,beach bum!,-0.5
5,bad music™️,-0.5
6,vengeful ghost,-0.5
7,rude lil pig,-0.666667
8,"death,grind,gore",-0.666667
9,audacity,-1.0


# Arabic and Hebrew Letters in Descriptions

In [362]:
# function that checks if a string contains hebrew characters.
def check_hebrew(string):
    return any("\u0590" <= c <= "\u05EA" for c in string)
def check_arabic(string):
    return any("\u0600" <= c <= "\u06FF" for c in string)

True

In [554]:
count_arabic = 0
count_hebrew = 0
for desc in descriptions:
    if check_arabic(desc):
        count_arabic += 1
    if check_hebrew(desc):
        count_hebrew += 1
count = count_arabic + count_hebrew
print('Number Of Descriptions that contains Hebrew characters', str(count_hebrew))
print('Percentage of the descriptions that contains Hebrew characters:',(str(format((100*count_hebrew/len(descriptions)), '.4f'))) , '%' )
print('-------------------------------------------------------')
print('Number Of Descriptions that contains Arabic characters', str(count_arabic))
print('Percentage of the descriptions that contains Arabic characters:',(str(format((100*count_arabic/len(descriptions)), '.4f'))) , '%' )



Number Of Descriptions that contains Hebrew characters 10
Percentage of the descriptions that contains Hebrew charcters: 0.0393 %
-------------------------------------------------------
Number Of Descriptions that contains  Arabic characters 54
Percentage of the descriptions that contains Arabic charcters: 0.2120 %


# Emojies in Descriptions

In [383]:
def check_emjoi(char):
    range_min = ord(u'\U0001F300') # 127744
    range_max = ord(u"\U0001FAF6") # 129782
    if ord(char) >= range_min and ord(char) <= range_max:
        return True
    return False

In [387]:
emojis = {}
str_desc = ''.join(descriptions)
for c in str_desc:
    if check_emjoi(c):
        if c in emojis:
            emojis[c] += 1
        else:
            emojis[c] = 1
emojis

{'🎙': 41,
 '📲': 42,
 '🌈': 85,
 '🏆': 147,
 '🙌': 29,
 '🏼': 143,
 '👟': 6,
 '📽': 15,
 '🤝': 9,
 '👕': 7,
 '📊': 6,
 '🌱': 24,
 '🎮': 35,
 '🌙': 8,
 '📱': 27,
 '👉': 84,
 '🌊': 22,
 '🦄': 7,
 '🔟': 2,
 '📧': 36,
 '💕': 35,
 '💗': 43,
 '🤷': 15,
 '🏻': 159,
 '🎤': 45,
 '🌟': 23,
 '🎵': 15,
 '💄': 10,
 '🐶': 29,
 '🎥': 46,
 '📸': 46,
 '🏳': 47,
 '🏴': 22,
 '💫': 50,
 '💜': 35,
 '🌺': 4,
 '📺': 64,
 '👩': 42,
 '💻': 31,
 '💔': 4,
 '🏀': 47,
 '🏈': 34,
 '🏞': 1,
 '💥': 33,
 '🎧': 32,
 '🎶': 54,
 '👶': 21,
 '🌎': 65,
 '🤲': 3,
 '🐷': 4,
 '🙃': 3,
 '👇': 168,
 '🖤': 45,
 '🍊': 7,
 '🕵': 5,
 '👱': 3,
 '🥋': 4,
 '🤾': 2,
 '🔥': 94,
 '💎': 14,
 '👋': 10,
 '👻': 50,
 '💦': 4,
 '🔮': 10,
 '🙏': 66,
 '😍': 11,
 '👏': 3,
 '🏾': 108,
 '😊': 11,
 '💯': 12,
 '💰': 12,
 '📚': 48,
 '🎭': 15,
 '🪐': 3,
 '🛸': 9,
 '🦁': 15,
 '🏠': 14,
 '🧪': 1,
 '🚀': 26,
 '🐨': 4,
 '📨': 4,
 '💃': 14,
 '💛': 42,
 '🛌': 1,
 '🎨': 14,
 '🐱': 3,
 '📍': 23,
 '🎬': 24,
 '🕊': 12,
 '💙': 39,
 '🌏': 17,
 '🔑': 4,
 '🌍': 38,
 '😄': 8,
 '🗣': 19,
 '🎉': 16,
 '🐬': 3,
 '🌸': 27,
 '🧛': 4,
 '🌿': 10,
 '🖍': 1,
 '📷': 28,
 '💌': 8

In [403]:
emojis = {k: v for k, v in sorted(emojis.items(), key=lambda item: item[1], reverse = True)}

In [397]:
import demoji
demoji.download_codes()

  demoji.download_codes()


In [410]:
emoji_values = []
for em in emojis:
        emoji_values.append((em[0], demoji.findall(em)[em], emojis[em[0]]))
pd.DataFrame(emoji_values, columns = ["Emoji", "Emoji Description", "Occurences"])[:10]

Unnamed: 0,Emoji,Emoji Description,Occurences
0,👇,backhand index pointing down,168
1,🏻,light skin tone,159
2,🏆,trophy,147
3,🏼,medium-light skin tone,143
4,🏾,medium-dark skin tone,108
5,🏽,medium skin tone,101
6,🔥,fire,94
7,🌈,rainbow,85
8,👉,backhand index pointing right,84
9,🙏,folded hands,66
