In [1]:
# Presumably the author's uniqname (course login) - TODO confirm.
# Not referenced by any other cell visible in this notebook.
MY_UNIQNAME = 'yuqin'

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import spacy
import os
from os import path
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
from wordcloud import WordCloud
import pathlib
import json

# Filter all warnings.
import warnings
warnings.filterwarnings('ignore')

# Load the English pipeline by its package name. The 'en' shortcut link was
# removed in spaCy 3 (loading it raises OSError there), while the package name
# works on spaCy 2 and 3 alike once the model is installed with
# `python -m spacy download en_core_web_sm`.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Use X_train, X_test, y_train, y_test for all of the following questions
from sklearn.model_selection import train_test_split

df = pd.read_csv('training.csv',encoding='latin-1',names=['polarity','id','date','query','user','text'])
# Keep rows 780000..820000 (40001 tweets). The index is reset so rows are
# labelled 0..n-1: later cells look tweets up with 0-based positions taken
# from the clustering output, and the original labels (780000 onward) would
# make those lookups raise KeyError.
df = df.iloc[780000:820001,:].reset_index(drop=True)
# Sanity check: list the polarity labels present in the window.
df.polarity.unique()

array([0, 4])

In [6]:
# Raw tweet text, one entry per row of `df`; this is the corpus that is
# vectorized and clustered below.
document = df['text']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Characters deleted by text_clean: ASCII digits plus the punctuation list.
# The literal is a raw string so the backslash is a real backslash instead of
# the invalid escape sequence "\,".
_TEXT_CLEAN_DELETE = str.maketrans(
    "", "", r'''0123456789!()-[]{};:'"\,<>./?@#$%^&*_~'''
)


def text_clean(book_text):
    """Lowercase a text and strip digits, punctuation and stop words.

    Parameters
    ----------
    book_text : str
        Raw text, for example a single tweet.

    Returns
    -------
    str
        The remaining words, each one preceded by a single space (so a
        non-empty result starts with a space), or "" when no word is left.
        The format is unchanged from the earlier implementation because
        callers rely on it being a plain string they can ``.split()``.

    Notes
    -----
    Only ASCII digits and the listed punctuation characters are removed;
    any other symbol stays attached to its word.
    Line breaks are treated as word separators. They used to be deleted,
    which glued the last word of one line to the first word of the next.
    """
    stripped = book_text.lower().translate(_TEXT_CLEAN_DELETE)

    # split() breaks on any whitespace run, newlines included.
    kept_words = [word for word in stripped.split() if word not in STOP_WORDS]

    # One join instead of repeated string concatenation.
    return "".join(" " + word for word in kept_words)


In [8]:

tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8, max_features=200000,
    min_df=0.2, stop_words='english',
    use_idf=True, tokenizer=text_clean, ngram_range=(1,3)
)

%time tfidf_matrix = tfidf_vectorizer.fit_transform(document)

print(tfidf_matrix.shape)

CPU times: user 3.06 s, sys: 73.7 ms, total: 3.13 s
Wall time: 3.13 s
(40001, 65)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine distance between all documents. The result is a dense
# square array with one row and one column per document, so its size grows
# quadratically (40001 x 40001 values for the matrix shape printed above).
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
from sklearn.decomposition import PCA
# Project every document's row of `dist` onto 5 principal components.
reduced_data = PCA(n_components=5).fit_transform(dist)

In [None]:
# Use seaborn's .clustermap() function to draw a hierarchically-clustered
# heatmap of the 5-component projection.
sns.clustermap(reduced_data)

In [None]:
from sklearn.decomposition import PCA
# Repeat the projection with 3 components. This overwrites the 5-component
# `reduced_data`, so when the notebook runs top to bottom the cells below
# work on the 3-component version.
reduced_data = PCA(n_components=3).fit_transform(dist)
# Use seaborn's .clustermap() function to draw a hierarchically-clustered heatmap
sns.clustermap(reduced_data)

In [None]:
from sklearn import metrics
from sklearn.cluster import KMeans

# Insert your code here

# Rule of thumb: k ~ sqrt(n / 2). n is read from the data being clustered
# rather than hard-coded (it used to be 30000, not the actual row count).
k_rule_of_thumb = np.sqrt(len(reduced_data) / 2)
print(k_rule_of_thumb)

# Candidate cluster counts, shared by both cost functions and both plots.
k_values = range(2, 20)

# Two different cost functions: elbow (inertia) and silhouette.
# Each k is fitted once so both scores describe the same clustering, and
# random_state makes the curves reproducible from run to run.
elbow_score = []
sihouette_score = []
for k in k_values:
    k_means = KMeans(init='k-means++', n_clusters=k, n_init=5, random_state=0)
    k_means.fit(reduced_data)
    elbow_score.append(k_means.inertia_)
    sihouette_score.append(metrics.silhouette_score(reduced_data, k_means.labels_))

# The column name 'sihouette' (sic) is kept because the following cell reads it.
score = pd.DataFrame()
score['elbow'] = elbow_score
score['sihouette'] = sihouette_score

fig, ax = plt.subplots()
fig.set_size_inches(5,8)
ax.plot(k_values, score['elbow'], 'b*-')
ax.set_xlim(left=1)
ax.set_xlabel('number of clusters k')
ax.set_ylabel('inertia')
ax.set_title('Elbow curve')

fig, ax = plt.subplots()
fig.set_size_inches(5,8)
ax.plot(k_values, score['sihouette'], 'b*-')
ax.set_xlim(left=1)
ax.set_xlabel('number of clusters k')
ax.set_ylabel('silhouette score')
ax.set_title('Silhouette curve');

In [None]:
# Redraw the two model-selection curves in landscape format (8 x 5 inches).
# The x values must match the k range used when `score` was built.
k_values = range(2, 20)

fig, ax = plt.subplots()
fig.set_size_inches(8,5)
ax.plot(k_values, score['elbow'], 'b*-')
ax.set_xlim(left=1)
ax.set_xlabel('number of clusters k')
ax.set_ylabel('inertia')
ax.set_title('Elbow curve')

fig, ax = plt.subplots()
fig.set_size_inches(8,5)
ax.plot(k_values, score['sihouette'], 'b*-')
ax.set_xlim(left=1)
ax.set_xlabel('number of clusters k')
ax.set_ylabel('silhouette score')
ax.set_title('Silhouette curve');

In [None]:
# Enter your code here
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import ward, dendrogram

# Ward-linkage hierarchical clustering of the PCA-reduced documents.
linkage_matrix = ward(reduced_data)

# Cut the tree into at most 3 flat clusters (labels start at 1).
# The function is called through its module so that the name `fcluster`,
# which the next cell iterates over as the label array, never replaces an
# imported function of the same name.
fcluster = hierarchy.fcluster(linkage_matrix, 3, criterion='maxclust')

In [None]:
# Pair every cluster label with its 0-based row position:
# [(0, label_0), (1, label_1), ...].
cluster_list = list(enumerate(fcluster))

In [None]:
# Collect the tweets of each flat cluster, then join each group once.
cluster_tweets = {1: [], 2: [], 3: []}
for position, label in cluster_list:
    if label in cluster_tweets:
        # `position` is a 0-based row number, so it must be resolved with
        # .iloc; a plain document[position] is a label lookup and fails when
        # the index does not start at 0.
        cluster_tweets[label].append(document.iloc[position])

# A space between tweets keeps the last word of one tweet from being fused
# with the first word of the next.
one = " ".join(cluster_tweets[1])
two = " ".join(cluster_tweets[2])
three = " ".join(cluster_tweets[3])

In [None]:
# Draw one word cloud per cluster text, each on its own 8 x 8 inch figure.
for cluster in (one, two, three):
    wordcloud = WordCloud().generate(cluster)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

In [None]:
# Word frequencies for cluster one (plain whitespace split, no cleaning).
from collections import Counter
one_counts = Counter(one.split()).most_common(100)
# Display ranks 51-100 of the 100 most common words.
one_counts[50:100]

In [None]:
# Parse the first 800000 characters of cluster one and sort its tokens into
# nouns, verbs and adjectives by coarse part-of-speech tag.
one_nlp = nlp(one[:800000])
one_noun, one_ad, one_verb = [], [], []
pos_buckets = {'NOUN': one_noun, 'VERB': one_verb, 'ADJ': one_ad}
for sent in one_nlp.sents:
    for token in sent:
        bucket = pos_buckets.get(token.pos_)
        if bucket is not None:
            bucket.append(str(token))

In [None]:
# Up to 50 most common nouns of cluster one as (word, count) pairs.
one_noun_counts = Counter(one_noun).most_common(50)
one_noun_counts

In [None]:
# Up to 50 most common adjectives of cluster one as (word, count) pairs.
one_ad_counts = Counter(one_ad).most_common(50)
one_ad_counts

In [None]:
# Up to 50 most common verbs of cluster one as (word, count) pairs.
one_verb_counts = Counter(one_verb).most_common(50)
one_verb_counts

In [None]:
# Word frequencies for cluster two (plain whitespace split, no cleaning).
two_counts = Counter(two.split()).most_common(100)
# two_counts[50:100]
# Display the 50 most common words.
two_counts[0:50]

In [None]:
# Parse cluster two and sort its tokens into nouns, verbs and adjectives.
# The text is capped at 800000 characters, the same limit used for cluster
# one: spaCy rejects texts longer than nlp.max_length, and a cluster's
# concatenated tweets can exceed it.
two_nlp = nlp(two[:800000])
two_noun = list()
two_ad = list()
two_verb = list()
two_pos_buckets = {'NOUN': two_noun, 'VERB': two_verb, 'ADJ': two_ad}
for token in two_nlp:
    bucket = two_pos_buckets.get(token.pos_)
    if bucket is not None:
        bucket.append(str(token))

In [None]:
# Up to 50 most common (word, count) pairs per part of speech, cluster two.
two_noun_counts = Counter(two_noun).most_common(50)
two_ad_counts = Counter(two_ad).most_common(50)
two_verb_counts = Counter(two_verb).most_common(50)

# Build the table from Series so a part of speech with fewer distinct words
# than the others is padded with NaN. Assigning lists of different lengths
# column by column raises ValueError (length mismatch with the index).
two_pos_count = pd.DataFrame({
    'noun': pd.Series(two_noun_counts, dtype=object),
    'adj': pd.Series(two_ad_counts, dtype=object),
    'verb': pd.Series(two_verb_counts, dtype=object),
})
two_pos_count

In [None]:
# Word frequencies for cluster three (plain whitespace split, no cleaning).
three_counts = Counter(three.split()).most_common(100)
# three_counts[50:100]
# Display the 50 most common words.
three_counts[0:50]

In [None]:
# Parse cluster three and sort its tokens into nouns, verbs and adjectives.
# The text is capped at 800000 characters, the same limit used for cluster
# one: spaCy rejects texts longer than nlp.max_length, and a cluster's
# concatenated tweets can exceed it.
three_nlp = nlp(three[:800000])
three_noun = list()
three_ad = list()
three_verb = list()
three_pos_buckets = {'NOUN': three_noun, 'VERB': three_verb, 'ADJ': three_ad}
for token in three_nlp:
    bucket = three_pos_buckets.get(token.pos_)
    if bucket is not None:
        bucket.append(str(token))

In [None]:
# Up to 50 most common (word, count) pairs per part of speech, cluster three.
three_noun_counts = Counter(three_noun).most_common(50)
three_ad_counts = Counter(three_ad).most_common(50)
three_verb_counts = Counter(three_verb).most_common(50)

# Build the table from Series so a part of speech with fewer distinct words
# than the others is padded with NaN. Assigning lists of different lengths
# column by column raises ValueError (length mismatch with the index).
three_pos_count = pd.DataFrame({
    'noun': pd.Series(three_noun_counts, dtype=object),
    'adj': pd.Series(three_ad_counts, dtype=object),
    'verb': pd.Series(three_verb_counts, dtype=object),
})
three_pos_count

In [None]:
# Show the most common nouns of cluster three as (word, count) pairs.
three_noun_counts

In [None]:
# Show the most common adjectives of cluster three as (word, count) pairs.
three_ad_counts

In [None]:
# Show the most common verbs of cluster three as (word, count) pairs.
three_verb_counts

In [None]:
# Rebuild the per-cluster texts: collect the tweets of each flat cluster,
# then join each group once.
cluster_tweets = {1: [], 2: [], 3: []}
for position, label in cluster_list:
    if label in cluster_tweets:
        # `position` is a 0-based row number, so it must be resolved with
        # .iloc; a plain document[position] is a label lookup and fails when
        # the index does not start at 0.
        cluster_tweets[label].append(document.iloc[position])

# A space between tweets keeps the last word of one tweet from being fused
# with the first word of the next.
one = " ".join(cluster_tweets[1])
two = " ".join(cluster_tweets[2])
three = " ".join(cluster_tweets[3])
def topic_tweets(cluster,topic,number):
    """Count nouns, adjectives and verbs in the sentences that mention a topic.

    Parameters
    ----------
    cluster : str
        Text to analyse; only its first 800000 characters are parsed.
    topic : str
        Token text to look for. The comparison is exact and case-sensitive.
    number : int
        Maximum number of (word, count) pairs kept per part of speech.

    Returns
    -------
    pandas.DataFrame
        Columns 'noun', 'adj' and 'verb', each holding (word, count) tuples
        in descending order of count. A column with fewer entries than the
        longest one is padded with NaN.

    Notes
    -----
    Part-of-speech tags are read from the sentences as parsed in context.
    A sentence is counted once, however often the topic occurs in it.
    """
    parsed = nlp(cluster[:800000])
    column_for_pos = {'NOUN': 'noun', 'ADJ': 'adj', 'VERB': 'verb'}
    tallies = {column: Counter() for column in column_for_pos.values()}

    for sent in parsed.sents:
        words = [(str(token), token.pos_) for token in sent]
        # any(): a sentence contributes once even if it names the topic twice.
        if not any(text == topic for text, _ in words):
            continue
        for text, pos in words:
            column = column_for_pos.get(pos)
            if column is not None:
                tallies[column][text] += 1

    # Series-based construction aligns columns of different lengths;
    # assigning ragged lists column by column raises ValueError.
    return pd.DataFrame({
        column: pd.Series(tallies[column].most_common(number), dtype=object)
        for column in ('noun', 'adj', 'verb')
    })

In [None]:
# Up to 20 noun/adj/verb counts from cluster-one sentences containing 'rain'.
rain = topic_tweets(one,'rain',20)
rain

In [None]:
# Up to 20 noun/adj/verb counts from cluster-two sentences containing 'rain'.
rain_two = topic_tweets(two,'rain',20)
rain_two

In [None]:
# Up to 20 noun/adj/verb counts from cluster-three sentences containing 'rain'.
rain_three = topic_tweets(three,'rain',20)
rain_three

In [None]:
# Up to 20 noun/adj/verb counts from cluster-one sentences containing 'weekend'.
weekend = topic_tweets(one,'weekend',20)
weekend

In [None]:
# Up to 10 noun/adj/verb counts from cluster-two sentences containing 'weekend'.
two_weekend = topic_tweets(two,'weekend',10)
two_weekend

In [None]:
# Up to 20 noun/adj/verb counts from cluster-three sentences containing 'weekend'.
three_weekend = topic_tweets(three,'weekend',20)
three_weekend

In [None]:
# Up to 20 noun/adj/verb counts from cluster-one sentences containing 'headache'.
headache = topic_tweets(one,'headache',20)
headache

In [None]:
# Up to 10 noun/adj/verb counts from cluster-two sentences containing 'headache'.
# Re-binds `headache`, replacing the table computed for another cluster.
headache = topic_tweets(two,'headache',10)
headache

In [None]:
# Up to 20 noun/adj/verb counts from cluster-three sentences containing 'headache'.
# Re-binds `headache`, replacing the table computed for another cluster.
headache = topic_tweets(three,'headache',20)
headache

In [None]:
# Up to 20 noun/adj/verb counts from cluster-one sentences containing 'work'.
work = topic_tweets(one,'work',20)
work

In [None]:
# Up to 20 noun/adj/verb counts from cluster-two sentences containing 'work'.
# Re-binds `work`, replacing the table computed for another cluster.
work = topic_tweets(two,'work',20)
work

In [None]:
# Up to 20 noun/adj/verb counts from cluster-three sentences containing 'work'.
# Re-binds `work`, replacing the table computed for another cluster.
work = topic_tweets(three,'work',20)
work

In [None]:
# Up to 20 noun/adj/verb counts from cluster-one sentences containing 'but'.
but = topic_tweets(one,'but',20)
but

In [None]:
# Up to 20 noun/adj/verb counts from cluster-two sentences containing 'but'.
# Re-binds `but`, replacing the table computed for another cluster.
but = topic_tweets(two,'but',20)
but

In [None]:
# Up to 20 noun/adj/verb counts from cluster-three sentences containing 'but'.
# Re-binds `but`, replacing the table computed for another cluster.
but = topic_tweets(three,'but',20)
but

In [None]:
# Up to 20 noun/adj/verb counts from cluster-one sentences containing 'night'.
night = topic_tweets(one,'night',20)
night

In [None]:
# Up to 10 noun/adj/verb counts from cluster-two sentences containing 'night'.
# Re-binds `night`, replacing the table computed for another cluster.
night = topic_tweets(two,'night',10)
night

In [None]:
# Up to 20 noun/adj/verb counts from cluster-three sentences containing 'night'.
# Re-binds `night`, replacing the table computed for another cluster.
night = topic_tweets(three,'night',20)
night

In [None]:
# (Naive Bayes, Maximum Entropy, and SVM)
# LogisticRegression
# df = pd.read_csv('training.csv',encoding='latin-1',names=['polarity','id','date','query','user','text'])
# df = df.iloc[100000:,:]
# df.polarity.unique()

In [None]:
# add topic label
def add_topics(document,df,topic):
    """Flag the documents that mention `topic` in a new 0/1 column of `df`.

    Every entry of `document` is passed through text_clean and split on
    whitespace; the entry is labelled 1 when one of the resulting words
    equals `topic`, otherwise 0.

    Parameters
    ----------
    document : iterable of str
        Texts to scan, in the same order as the rows of `df`.
    df : pandas.DataFrame
        Frame that receives the new column. It is modified in place.
    topic : str
        Word to look for; also used as the name of the new column.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, now carrying the `topic` column.
    """
    df[topic] = [
        1 if topic in text_clean(tweet).split() else 0
        for tweet in document
    ]
    return df

In [None]:
document = df['text']
topics = ["weather","rain","sun","work","job","homework","school","kids","friends","he","her","she","him","sleep","phone"]

# Clean and split every tweet once. Cleaning does not depend on the topic,
# so repeating it for each of the topics multiplied the work for no gain.
tweet_words = [set(text_clean(tweet).split()) for tweet in document]

# One 0/1 column per topic: 1 when the cleaned tweet contains that word.
for topic in topics:
    df[topic] = [int(topic in words) for words in tweet_words]

In [None]:
# `dist` is a square documents-by-documents matrix, and pandas raises
# ValueError when a 2-D array is assigned to a single column.
# NOTE(review): presumably the goal was to give the classifiers a text-based
# feature; the PCA components of `dist` are stored instead, one column per
# component - confirm this matches the intended feature set.
for component in range(reduced_data.shape[1]):
    df['tfidf_pc%d' % component] = reduced_data[:, component]

In [None]:
# Target: the polarity label (first CSV field).
y = df['polarity']
# Features: every column from position 6 on, i.e. everything added to the
# frame after the six raw CSV fields.
X = df.iloc[:, 6:]
# Fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# DummyClassifier was used without ever being imported (NameError).
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Baseline: always predicts the most frequent label in the training set.
clf = DummyClassifier(strategy='most_frequent',random_state=0)
clf.fit(X_train, y_train)
# Estimate the accuracy of the classifier on future data, using the test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred,normalize = True)
# Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
recall = recall_score(y_test, y_pred, average='macro')
print (accuracy,recall)

In [None]:
from sklearn.model_selection import cross_val_score
import sklearn.ensemble as skens
import sklearn.metrics as skmetric
import sklearn.naive_bayes as sknb
import sklearn.tree as sktree
# 10-tree random forest with entropy splits and out-of-bag scoring.
# random_state=0 matches the seed used for the train/test split and the
# baseline classifier, so the reported score is reproducible.
rf_model_10 = skens.RandomForestClassifier(n_estimators=10,oob_score=True, criterion='entropy', random_state=0)
rf_model_10.fit(X_train,y_train)

# Mean accuracy on the held-out split.
print("For test dataset: ", rf_model_10.score(X_test, y_test))
# Importances follow the column order of X_train.
feat_importance_10 = rf_model_10.feature_importances_
# pd.DataFrame({'Feature Importance':feat_importance},
#             index=df_mb_train.columns[:-1]).plot(kind='barh')

In [None]:
# cross-validation by GridSearch
from sklearn.model_selection import GridSearchCV
# 16 forest sizes (5, 10, ..., 80) x 29 depths (1..29), 10 folds each:
# 4640 fits in total.
param_grid = {
                 'n_estimators': list(range(5, 85, 5)),
                 'max_depth': range(1,30)
             }
# Seeded like the other estimators in this notebook so that the selected
# parameters do not change from run to run.
rf_model = skens.RandomForestClassifier(random_state=0)
# n_jobs=-1 runs the fits on all available cores.
grid_clf = GridSearchCV(rf_model, param_grid, cv=10, n_jobs=-1)
grid_clf.fit(X_train,y_train)
print(grid_clf.best_estimator_)
print(grid_clf.best_params_)
print(grid_clf.best_score_)

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split, then report
# its mean accuracy on the held-out split.
gnb_model = sknb.GaussianNB().fit(X_train, y_train)

gnb_test_accuracy = gnb_model.score(X_test, y_test)
print("For test dataset: ", gnb_test_accuracy)

In [None]:
# most important feature is URL_Length
# gnb_model_mf = sknb.GaussianNB()
# gnb_model_mf.fit(df_mb_train[['URL_LENGTH']],df_mb_train.Type)
# print("For validation dataset: ", gnb_model_mf.score(df_mb_validation[['URL_LENGTH']], df_mb_validation.Type))
# print("For test dataset: ", gnb_model_mf.score(df_mb_test_scaled[['URL_LENGTH']], df_mb_test_scaled.Type))

In [None]:
# def answer_six():    
#     # YOUR CODE HERE
#     clf = LogisticRegression()
#     grid_values = {'C': [0.01, 0.1, 0.5, 1, 10, 100], 'penalty': ['l1', 'l2']}
#     grid_search = GridSearchCV(clf, param_grid=grid_values, scoring='recall')
#     grid_search.fit(X_train, y_train)
#     cv_result = grid_search.cv_results_
#     mean_test_score = cv_result['mean_test_score']
#     result = np.array(mean_test_score).reshape(6,2)
#     return result
# answer_six()

In [None]:
# def GridSearch_Heatmap(scores):
#     %matplotlib inline
#     import seaborn as sns
#     import matplotlib.pyplot as plt
#     plt.figure()
#     sns.heatmap(scores.reshape(6,2), xticklabels=['l1','l2'], yticklabels=[0.01, 0.1, 0.5, 1, 10, 100])
#     plt.yticks(rotation=0);
# GridSearch_Heatmap(answer_six())