### 3 Mining Text Data

In [1]:
# imports
import sys
import csv
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import PorterStemmer
from collections import Counter

import requests

In [2]:
# Return a pandas dataframe containing the data set.
# Specify a 'latin-1' encoding when reading the data.
# data_file will be populated with the string 'wholesale_customers.csv'.
def read_csv_3(data_file):
    
    df = pd.read_csv(data_file,index_col=False, encoding = 'latin-1')
    
    return df

In [3]:
# Return a list with the possible sentiments that a tweet might have.
def get_sentiments(df):
    
    sentiments = df['Sentiment'].unique().tolist() 
        
    return sentiments

In [4]:
# Return a string containing the second most popular sentiment among the tweets.
def second_most_popular_sentiment(df):
    
  
    total_Counts = df.groupby('Sentiment')['Sentiment'].count().reset_index(name='Count')\
                    .sort_values(['Count'], ascending=False).reset_index(drop=True)
    # print(total_Counts)
    
    second = total_Counts['Sentiment'][1]
    
    return second

In [5]:
# Return the date (string as it appears in the data) with the greatest number of extremely positive tweets.
def date_most_popular_tweets(df):
    
    total_Counts = df.groupby(['TweetAt','Sentiment'])['TweetAt'].count().reset_index(name='Count')\
                    .sort_values(['Count'], ascending=False).reset_index(drop=True)
    # print(total_Counts)
    
    # date  = total_Counts[['TweetAt', 'Sentiment', 'Count']][total_Counts['Sentiment'] == 'Positive'].reset_index(drop=True)
    date  = total_Counts['TweetAt'][total_Counts['Sentiment'] == 'Positive'].reset_index(drop=True)[0]
    
    return date

In [6]:
# Modify the dataframe df by converting all tweets to lower case. 
def lower_case(df):
    
    df['OriginalTweet'] = df['OriginalTweet'].str.lower()

In [7]:
# Modify the dataframe df by replacing each characters which is not alphabetic or whitespace with a whitespace.
def remove_non_alphabetic_chars(df):
    
    df['OriginalTweet'] = df['OriginalTweet'].str.replace(r'[^a-zA-Z]', ' ', regex=True)

In [8]:
# Modify the dataframe df with tweets after removing characters which are not alphabetic or whitespaces.
def remove_multiple_consecutive_whitespaces(df):
    
    df['OriginalTweet'] = df['OriginalTweet'].replace(r'\s+', ' ', regex=True)

In [9]:
# Given a dataframe where each tweet is one string with words separated by single whitespaces,
# tokenize every tweet by converting it into a list of words (strings).
def tokenize(df):
    
    df['OriginalTweet'] = df.apply(lambda row: nltk.word_tokenize(row['OriginalTweet']), axis=1)

In [10]:
# Given dataframe tdf with the tweets tokenized, return the number of words in all tweets including repetitions.
def count_words_with_repetitions(tdf):

    count_words = tdf['OriginalTweet'].str.len().sum()
    
    return count_words

In [11]:
# Given dataframe tdf with the tweets tokenized, return the number of distinct words in all tweets.
def count_words_without_repetitions(tdf):
    
    count_words = sum(tdf['OriginalTweet'].apply(set).apply(len))
    
    return count_words

In [25]:
# Given dataframe tdf with the tweets tokenized, return a list with the k distinct words that are most frequent in the tweets.
def frequent_words(tdf,k):
    
    split_it = ''.join([' '.join(wrd for wrd in x) for x in tdf['OriginalTweet']]).split()
    Count = Counter(split_it)
    most_occur = Count.most_common(k)
    
    return [most_occur[x][0] for x in range(k)]

In [13]:
# Given dataframe tdf with the tweets tokenized, remove stop words and words with <=2 characters from each tweet.
# The function should download the list of stop words via:
# https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt
def remove_stop_words(tdf):
    
    x = requests.get('https://raw.githubusercontent.com/fozziethebeat/S-Space/master/data/english-stop-words-large.txt')
    stopWords = (x.text).strip('][').split('\n')
          
    tdf['OriginalTweet'] = tdf['OriginalTweet'].apply(lambda x: \
                            [word for word in x if (word not in (stopWords) and len(word)>2)]) 

In [14]:
# Given dataframe tdf with the tweets tokenized, reduce each word in every tweet to its stem.
def stemming(tdf):
    
    ps = PorterStemmer()
    
    tdf['OriginalTweet'] = tdf['OriginalTweet'].apply(lambda x: \
                            [ps.stem(word) for word in x])     

In [15]:
# Given a pandas dataframe df with the original coronavirus_tweets.csv data set,
# build a Multinomial Naive Bayes classifier. 
# Return predicted sentiments (e.g. 'Neutral', 'Positive') for the training set
# as a 1d array (numpy.ndarray). 
def mnb_predict(df):
	pass

In [16]:
# Given a 1d array (numpy.ndarray) y_pred with predicted labels (e.g. 'Neutral', 'Positive') 
# by a classifier and another 1d array y_true with the true labels, 
# return the classification accuracy rounded in the 3rd decimal digit.
def mnb_accuracy(y_pred,y_true):
	pass

In [17]:
# 1. [13 points] Compute the possible sentiments that a tweet may have, the second most popular
# sentiment in the tweets, and the date with the greatest number of extremely positive tweets.
# Next, convert the messages to lower case, replace non-alphabetical characters with whitespaces
# and ensure that the words of a message are separated by a single whitespace.

# Read data 
df = read_csv_3('Corona_NLP_train.csv')
df.head()

# possible sentiments that a tweet may have
print('1.1. Possible sentiments that a tweet may have: ')
sentiments = get_sentiments(df)
print(sentiments)

# the second most popular sentiment in the tweets
print('1.2. The second most popular sentiment in the tweets: ')
second_sentiment = second_most_popular_sentiment(df)
print(second_sentiment)

# the date with the greatest number of extremely positive tweets
print('1.3. The date with the greatest number of extremely positive tweets: ')
date_tweets = date_most_popular_tweets(df)
print(date_tweets)

# convert the messages to lower case
lower_case(df)

# replace non-alphabetical characters with whitespaces
remove_non_alphabetic_chars(df)

# ensure that the words of a message are separated by a single whitespace.
remove_multiple_consecutive_whitespaces(df)
df.head()

1.1. Possible sentiments that a tweet may have: 
['Neutral', 'Positive', 'Extremely Negative', 'Negative', 'Extremely Positive']
1.2. The second most popular sentiment in the tweets: 
Negative
1.3. The date with the greatest number of extremely positive tweets: 
20-03-2020


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,menyrbie phil gahan chrisitv https t co ifz f...,Neutral
1,3800,48752,UK,16-03-2020,advice talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,coronavirus australia woolworths to give elder...,Positive
3,3802,48754,,16-03-2020,my food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,me ready to go at supermarket during the covid...,Extremely Negative


In [18]:
# 2. [14 points] Tokenize the tweets (i.e. convert each into a list of words), count the total number
# of all words (including repetitions), the number of all distinct words and the 10 most frequent
# words in the corpus. Remove stop words, words with ≤ 2 characters, and reduce each word to
# its stem. You are now able to recompute the 10 most frequent words in the modified corpus.
# What do you observe?

# Tokenize the tweets 
tokenize(df)
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,"[menyrbie, phil, gahan, chrisitv, https, t, co...",Neutral
1,3800,48752,UK,16-03-2020,"[advice, talk, to, your, neighbours, family, t...",Positive
2,3801,48753,Vagabonds,16-03-2020,"[coronavirus, australia, woolworths, to, give,...",Positive
3,3802,48754,,16-03-2020,"[my, food, stock, is, not, the, only, one, whi...",Positive
4,3803,48755,,16-03-2020,"[me, ready, to, go, at, supermarket, during, t...",Extremely Negative


In [19]:
# count the total number of all words
print('2.1. Total number of all words: ')
count_words = count_words_with_repetitions(df)
print(count_words)

# the number of all distinct words
print('2.2. Total number of all distinct words: ')
count_words = count_words_without_repetitions(df)
print(count_words)

# 10 most frequent words
print('2.3. Most frequent words: ')
freq_word = frequent_words(df,10)
print(freq_word)

remove_stop_words(df)
df.head()

stemming(df)
df.head()

# 10 most frequent words
print('2.4. Most frequent words after stop words and stemming: ')
freq_word = frequent_words(df,10)
print(freq_word)

1351476
1207314
['the', 'to', 't', 'co', 'and']


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
41147,44946,89898,"Brooklyn, NY",14-04-2020,"[shitting, home, covid, coronavirus, toiletpaper]",Negative
41148,44947,89899,,14-04-2020,"[light, sterilizer, sanitizer, mask, mobile, p...",Extremely Positive
41149,44948,89900,"Toronto, Ontario",14-04-2020,"[shocked, number, toronto, supermarket, employ...",Negative
41150,44949,89901,OHIO,14-04-2020,"[situation, amp, world, supermarket, picking, ...",Positive
41151,44950,89902,,14-04-2020,"[mrsilverscott, man, feel, fall, honor, heroes...",Extremely Positive
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,"[airline, pilots, offering, stock, supermarket...",Neutral
41153,44952,89904,,14-04-2020,"[response, complaint, provided, citing, covid,...",Extremely Negative
41154,44953,89905,,14-04-2020,"[tough, kameronwilds, rationing, toilet, paper...",Positive
41155,44954,89906,,14-04-2020,"[wrong, smell, hand, sanitizer, starting, turn...",Neutral
41156,44955,89907,i love you so much || he/him,14-04-2020,"[tartiicat, rift, amazon, normal, market, pric...",Negative


In [23]:
# [13 points] This task can be done individually from the previous three. 
# Store the coronavirus tweets.py corpus in a numpy array and produce a sparse representation of 
# the term document matrix with a CountVectorizer. Next, produce a Multinomial Naive Bayes classifier
# using the provided data set. What is the classifier’s training accuracy? A CountVectorizer allows
# limiting the range of frequencies and number of words included in the term-document matrix.
# Appropriately tune these parameters to achieve the highest classification accuracy you can.

