In [0]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import gensim
import scipy.sparse as sp
import scipy.linalg as sparcyLinalg
from nltk.stem import WordNetLemmatizer, SnowballStemmer

In [64]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [65]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
BASE_DIR = '/content/drive/My Drive/tfidf_cosine/'

In [0]:
DATASETS_PATH = BASE_DIR

In [0]:
def load_data(filename_train, filename_test, filename_val):
    """
    Load train, test and validation sets from file

    Args:
        filename_train : Name of the file from which the train data is to be loaded
        filename_test : Name of the file from which the test data is to be loaded
        filename_val : Name of the file from which the validation data is to be loaded
    
    Returns:
        train_tweet_X: list of tweets from train data
        train_tweet_Y: list of lables correponding to each tweet from train data
        test_tweet_X: list of tweets from test data
        test_tweet_Y: list of lables correponding to each tweet from test data
        val_tweet_X: list of tweets from validation data
        val_tweet_Y: list of lables correponding to each tweet from validation data
    """
    train = pd.read_csv(filename_train)
    test = pd.read_csv(filename_test)
    val = pd.read_csv(filename_val)
    
    return train['Text'].tolist(), train['Label'].tolist(), test['Text'].tolist(), test['Label'].tolist(), val['Text'].tolist(), val['Label'].tolist()

In [69]:
# LOADING TRAIN, TEST AND VALIDATION SETS

train_tweet_X, train_tweet_Y, test_tweet_X, test_tweet_Y, val_tweet_X, val_tweet_Y = load_data(DATASETS_PATH + 'train.csv', DATASETS_PATH + 'test.csv', DATASETS_PATH + 'val.csv')

print("Train length: \t", len(train_tweet_X))
print("Test length: \t", len(test_tweet_X))
print("Val length: \t", len(val_tweet_X))

Train length: 	 40231
Test length: 	 11833
Val length: 	 7100


In [0]:
def preprocessing(tweets):
    """
    Perform preprocessing of the tweets

    Args:
        tweets : list of tweets
    
    Returns:
        result: preprocessed list of tweets
    """
    #set of stopwords
    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    
    result = []
    for tweet in tweets:
        
        #tokenizing each tweet
        tokens = word_tokenize(tweet)
        
        #removing stopwords and keeping words length greater than 2
        stopwords_removed_tokens = []
        for word in tokens:
            if word not in stop_words and len(word) > 2:
                stopwords_removed_tokens.append(word)
        
        #lemmatization and stemming
        lemmatized_tokens = []
        for word in stopwords_removed_tokens:
          lemmatized_tokens.append(stemmer.stem(lemmatizer.lemmatize(word)))

        
        result.append(lemmatized_tokens)
    
    return result

In [0]:
# PREPROCESSING TRAIN, TEST AND VALIDATION TWEETS

train_tweet_X = preprocessing(train_tweet_X)
test_tweet_X = preprocessing(test_tweet_X)
val_tweet_X = preprocessing(val_tweet_X)

In [0]:
#COMBINING TRAIN AND TEST TWEETS

combined_tweets_X = []
for tweet in train_tweet_X:
    combined_tweets_X.append(tweet)
for tweet in test_tweet_X:
    combined_tweets_X.append(tweet)

combined_Y = []
for label in train_tweet_Y:
    combined_Y.append(label)
for label in test_tweet_Y:
    combined_Y.append('UNKNOWN')

actual_combined_Y = []
for label in train_tweet_Y:
    actual_combined_Y.append(label)
for label in test_tweet_Y:
    actual_combined_Y.append(label)

In [0]:
# CREATING ARRAYS CONTAINING INDICES CORREPONDING TO EACH POLITICIANS TWEET IN combined_tweets_X
fawadchaudhry_tweets = []
maryamnsharif_tweets = []
mjibrannasir_tweets = []
narendramodi_tweets = []
sherryrehman_tweets = []

for i, label in enumerate(actual_combined_Y):
  if label == 'fawadchaudhry':
    fawadchaudhry_tweets.append(combined_tweets_X[i])
  elif label == 'maryamnsharif':
    maryamnsharif_tweets.append(combined_tweets_X[i])
  elif label == 'mjibrannasir':
    mjibrannasir_tweets.append(combined_tweets_X[i])
  elif label == 'narendramodi':
    narendramodi_tweets.append(combined_tweets_X[i])
  elif label == 'sherryrehman':
    sherryrehman_tweets.append(combined_tweets_X[i])

In [0]:
fawadchaudhry_dictionary = gensim.corpora.Dictionary(fawadchaudhry_tweets)
maryamnsharif_dictionary = gensim.corpora.Dictionary(maryamnsharif_tweets)
mjibrannasir_dictionary = gensim.corpora.Dictionary(mjibrannasir_tweets)
narendramodi_dictionary = gensim.corpora.Dictionary(narendramodi_tweets)
sherryrehman_dictionary = gensim.corpora.Dictionary(sherryrehman_tweets)

In [75]:
'''
Checking dictionaries created
'''
count = 0
for k, v in fawadchaudhry_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

count = 0
for k, v in maryamnsharif_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

count = 0
for k, v in mjibrannasir_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

count = 0
for k, v in narendramodi_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

count = 0
for k, v in sherryrehman_dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 baloch
1 cricket
2 jameel
3 one
4 page
5 pakistan
6 qndeel
7 tariq
8 unit
9 wct20
10 advoc
0 instead
1 mian
2 mosu
3 nawaz
4 other
5 pakistan
6 politician
7 popular
8 rule
9 sharif
10 abt
0 address
1 avenu
2 coal
3 concern
4 csr
5 develop
6 dir
7 goranno
8 mine
9 must
10 new
0 arm
1 day
2 donat
3 flag
4 forc
5 generous
6 personnel
7 rememb
8 valour
9 welfar
10 common
0 coverag
1 crisi
2 expand
3 factual
4 give
5 keep
6 minimum
7 mute
8 pindi
9 rise
10 rumor


In [0]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
# fawadchaudhry_dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)
fawadchaudhry_dictionary.filter_extremes(no_below=3)
maryamnsharif_dictionary.filter_extremes(no_below=3)
mjibrannasir_dictionary.filter_extremes(no_below=3)
narendramodi_dictionary.filter_extremes(no_below=3)
sherryrehman_dictionary.filter_extremes(no_below=3)

In [0]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
fawadchaudhry_bow_corpus = [fawadchaudhry_dictionary.doc2bow(doc) for doc in fawadchaudhry_tweets]
maryamnsharif_bow_corpus = [maryamnsharif_dictionary.doc2bow(doc) for doc in maryamnsharif_tweets]
mjibrannasir_bow_corpus = [mjibrannasir_dictionary.doc2bow(doc) for doc in mjibrannasir_tweets]
narendramodi_bow_corpus = [narendramodi_dictionary.doc2bow(doc) for doc in narendramodi_tweets]
sherryrehman_bow_corpus = [sherryrehman_dictionary.doc2bow(doc) for doc in sherryrehman_tweets]

In [78]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 20
sherryrehman_bow_doc_x = sherryrehman_bow_corpus[document_num]

for i in range(len(sherryrehman_bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(sherryrehman_bow_doc_x[i][0], 
                                                     sherryrehman_dictionary[sherryrehman_bow_doc_x[i][0]], 
                                                     sherryrehman_bow_doc_x[i][1]))

Word 3 ("give") appears 1 time.
Word 49 ("spend") appears 1 time.
Word 91 ("forward") appears 1 time.
Word 97 ("realli") appears 1 time.
Word 149 ("better") appears 1 time.
Word 151 ("child") appears 1 time.
Word 157 ("neemtreeiftar") appears 1 time.
Word 225 ("abil") appears 1 time.
Word 226 ("access") appears 1 time.
Word 227 ("amaz") appears 1 time.
Word 228 ("hope") appears 1 time.
Word 229 ("look") appears 1 time.
Word 230 ("opportun") appears 1 time.
Word 231 ("potenti") appears 1 time.
Word 232 ("realis") appears 1 time.
Word 233 ("thank") appears 1 time.
Word 234 ("time") appears 1 time.


In [0]:
fawadchaudhry_model =  gensim.models.LdaMulticore(fawadchaudhry_bow_corpus, num_topics = 8, id2word = fawadchaudhry_dictionary, passes = 10, workers = 2)
maryamnsharif_model =  gensim.models.LdaMulticore(maryamnsharif_bow_corpus, num_topics = 8, id2word = maryamnsharif_dictionary, passes = 10, workers = 2)
mjibrannasir_model =  gensim.models.LdaMulticore(mjibrannasir_bow_corpus, num_topics = 8, id2word = mjibrannasir_dictionary, passes = 10, workers = 2)
narendramodi_model =  gensim.models.LdaMulticore(narendramodi_bow_corpus, num_topics = 8, id2word = narendramodi_dictionary, passes = 10, workers = 2)
sherryrehman_model =  gensim.models.LdaMulticore(sherryrehman_bow_corpus, num_topics = 8, id2word = sherryrehman_dictionary, passes = 10, workers = 2)

In [80]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
print("fawadchaudhry")
for idx, topic in fawadchaudhry_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("maryamnsharif")
for idx, topic in maryamnsharif_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("mjibrannasir")
for idx, topic in mjibrannasir_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("narendramodi")
for idx, topic in narendramodi_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("sherryrehman")
for idx, topic in sherryrehman_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

fawadchaudhry
Topic: 0 
Words: 0.053*"hai" + 0.024*"tou" + 0.019*"hein" + 0.019*"mein" + 0.019*"aur" + 0.018*"kia" + 0.017*"app" + 0.016*"nahi" + 0.013*"nai" + 0.013*"yeah"


Topic: 1 
Words: 0.017*"pak" + 0.010*"must" + 0.009*"thi" + 0.007*"ppp" + 0.007*"world" + 0.007*"polit" + 0.006*"peopl" + 0.006*"knw" + 0.005*"india" + 0.005*"like"


Topic: 2 
Words: 0.014*"pti" + 0.011*"ppp" + 0.011*"one" + 0.009*"pak" + 0.009*"kill" + 0.009*"support" + 0.008*"must" + 0.008*"mqm" + 0.007*"pmln" + 0.007*"tht"


Topic: 3 
Words: 0.011*"cricket" + 0.009*"one" + 0.007*"pti" + 0.007*"cong" + 0.006*"report" + 0.005*"old" + 0.005*"provinc" + 0.005*"state" + 0.005*"parti" + 0.005*"wish"


Topic: 4 
Words: 0.031*"pak" + 0.009*"medium" + 0.008*"see" + 0.007*"even" + 0.007*"pti" + 0.007*"armi" + 0.006*"want" + 0.006*"state" + 0.006*"use" + 0.006*"like"


Topic: 5 
Words: 0.013*"tht" + 0.013*"elect" + 0.011*"case" + 0.009*"judg" + 0.009*"pak" + 0.009*"shld" + 0.008*"cjp" + 0.008*"need" + 0.007*"court" + 0.0

In [62]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
print("fawadchaudhry")
for idx, topic in fawadchaudhry_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("maryamnsharif")
for idx, topic in maryamnsharif_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("mjibrannasir")
for idx, topic in mjibrannasir_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("narendramodi")
for idx, topic in narendramodi_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")
print("sherryrehman")
for idx, topic in sherryrehman_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

fawadchaudhry
Topic: 0 
Words: 0.049*"hai" + 0.031*"hein" + 0.017*"app" + 0.015*"mein" + 0.014*"yeah" + 0.012*"tou" + 0.011*"medium" + 0.011*"aur" + 0.010*"nai" + 0.009*"bhai"


Topic: 1 
Words: 0.020*"kill" + 0.015*"say" + 0.011*"right" + 0.010*"thi" + 0.010*"taliban" + 0.010*"woman" + 0.010*"drone" + 0.008*"see" + 0.008*"support" + 0.008*"condemn"


Topic: 2 
Words: 0.019*"pti" + 0.019*"govt" + 0.019*"ppp" + 0.013*"tht" + 0.012*"pmln" + 0.009*"state" + 0.009*"must" + 0.009*"thi" + 0.009*"polit" + 0.008*"support"


Topic: 3 
Words: 0.021*"tht" + 0.018*"shld" + 0.013*"must" + 0.011*"hve" + 0.011*"decis" + 0.010*"day" + 0.009*"knw" + 0.008*"state" + 0.008*"make" + 0.007*"ask"


Topic: 4 
Words: 0.018*"peopl" + 0.011*"india" + 0.010*"like" + 0.009*"world" + 0.009*"armi" + 0.009*"state" + 0.008*"pti" + 0.008*"wht" + 0.007*"ppp" + 0.007*"best"


Topic: 5 
Words: 0.013*"case" + 0.012*"judg" + 0.011*"court" + 0.011*"law" + 0.011*"time" + 0.011*"one" + 0.010*"sharif" + 0.010*"need" + 0.010*"c

In [0]:
#INFERRING TOPICS FROM THE OUTPUT ABOVE:

#FAWAD CHAUDHRY
# Topic 0: roman urdu words
# Topic 1: terrorists
# Topic 2: political parties
# Topic 3: decisiveness
# Topic 4: india army
# Topic 5: judiciary
# Topic 6: pti govt
# Topic 7:

#MARYAM NAWAZ
# Topic 0: govt
# Topic 1: nawaz sharif meet pti
# Topic 2: election
# Topic 3: prime minister nawaz sharif press conference(meeting people answer questions)
# Topic 4: thanking people
# Topic 5: pml wishing well for pakistan 
# Topic 6: thanking god
# Topic 7: businessman support for political party

#M JIBRAN NASIR
# Topic 0: case where police kills people
# Topic 1: elections
# Topic 2: court case abdul aziz
# Topic 3: pakistani family support student
# Topic 4: law making
# Topic 5: roman urdu words
# Topic 6: support karachi with relief
# Topic 7:protest

#MODI
# Topic 0: inspire effort for contribution to bharat
# Topic 1: peace
# Topic 2: wishing people
# Topic 3: meeting the president
# Topic 4: thanking farmers in gujrat
# Topic 5: congratulating indian team
# Topic 6: 
# Topic 7:

#SHERRY REHMAN
# Topic 0: 
# Topic 1: 
# Topic 2: 
# Topic 3: 
# Topic 4: 
# Topic 5: 
# Topic 6: 
# Topic 7: