In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from functions import load_dataset
from functions import remove_unwanted_cols
from functions import preprocess_tweet_text
from functions import get_feature_vector
from functions import int_to_string
from functions import load_model
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from wordcloud import WordCloud
from pandas import option_context
import ftplib
import sched, time
import re

In [2]:
# Load saved cleaned dataset
dataset = load_dataset("cleaned_merge_dataset.csv", ["label", "text"])
display(dataset.groupby('label').count())

Unnamed: 0,label,text
0,0.0,incredible india atulya bharat land seekers beproud plz
1,0.0,near western union want make 300 today hit let walk process
2,1.0,firstdayofschool students teachers good luck successful 2016 17 school year educationmatters ht
3,0.0,kate wrights figure want life
4,1.0,josh jenkins looking forward tab breeders crown super sunday


Data has 2357776 rows, 2 columns.


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
-1.0,821191
0.0,460857
1.0,1067747


In [3]:
# Undersampling
min_count_type = dataset.label.value_counts().min()
sentiment_types = list(dataset.label.unique())
subdatasets = list()
for sentiment_type in sentiment_types :
    dataset_sentiment_type = dataset[dataset['label']==sentiment_type]
    dataset_sentiment_type = dataset_sentiment_type.sample(min_count_type)
    subdatasets.append(dataset_sentiment_type)
dataset_undersampled = pd.concat(subdatasets)
dataset_undersampled.sample(frac=1)
display(dataset_undersampled.groupby('label').count())

dataset = pd.DataFrame(dataset_undersampled)
display(dataset['text'].size)

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
-1.0,458700
0.0,460857
1.0,459111


1382574

In [4]:
# TFIDF matrix, weighting factor
tf_vector = get_feature_vector(np.array(dataset['text'].values.astype('U')).ravel())

In [5]:
# print idf values 
df_idf = pd.DataFrame(tf_vector.idf_, index=tf_vector.get_feature_names(),columns=["idf_weights"]) 

# sort ascending 
display(df_idf.sort_values(by=['idf_weights'])[0:10])
display(df_idf.sort_values(by=['idf_weights'])[1100:1110])

Unnamed: 0,idf_weights
new,4.127788
day,4.213376
get,4.275816
good,4.304319
today,4.34258
love,4.389845
like,4.393865
go,4.482952
got,4.574134
one,4.609364


Unnamed: 0,idf_weights
brothers,7.882456
liked,7.88528
lake,7.88528
asked,7.886696
blessed,7.888823
ideas,7.888823
sadly,7.889533
china,7.889533
university,7.889533
smartphone,7.890954


In [6]:
X = tf_vector.transform(np.array(dataset['text'].values.astype('U')).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()
display(dataset.head(5))
print(X[0:5])

Unnamed: 0,label,text
360230,0.0,want work phoenix az view latest opening job f...
177479,0.0,mars venus earth
1517,0.0,brownies bomb
292712,0.0,hassan aman twitter matte black acura nsx sigh...
289897,0.0,tips fit workout 10


  (0, 393590)	0.1863681359375416
  (0, 384061)	0.19900331896555
  (0, 379734)	0.2897220165307443
  (0, 277425)	0.3603957394762402
  (0, 265908)	0.3060627920051177
  (0, 211656)	0.2771011276578447
  (0, 195604)	0.3096159863978573
  (0, 195550)	0.25251155514008805
  (0, 173437)	0.3194483211949708
  (0, 139949)	0.3680785217261145
  (0, 51439)	0.379773644254873
  (1, 378574)	0.6593526711187568
  (1, 229286)	0.5635484053081756
  (1, 122505)	0.497661782703045
  (2, 72819)	0.7346835090847935
  (2, 67713)	0.6784100098604487
  (3, 369269)	0.14429355019600965
  (3, 343273)	0.3905157430614324
  (3, 334575)	0.37955512431920513
  (3, 323937)	0.31917618596919356
  (3, 259727)	0.3444326313043041
  (3, 230570)	0.24776775696211084
  (3, 168371)	0.32117948837991506
  (3, 93709)	0.26201568547421117
  (3, 64429)	0.15341459237008204
  (3, 39291)	0.33604845754389684
  (3, 32145)	0.3040485333770623
  (4, 393760)	0.5654985595861013
  (4, 359277)	0.5031343846352143
  (4, 140940)	0.5198797988063167
  (4, 2285)	

In [7]:
# Split dataset into Train, Validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.01, random_state=34)

In [8]:
print(X_train.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(1368748, 403541)
(13826, 403541)
(1368748,)
(13826,)


In [9]:
# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
print("Naive Bayes training accuracy: " + str(NB_model.score(X_train, y_train)) + "%")
print("Naive Bayes test accuracy: " + str(NB_model.score(X_valid, y_valid)) + "%")

# Training LinearSVC model
LSVC_model = LinearSVC()
LSVC_model.fit(X_train, y_train)
print("LinearSVC training accuracy: " + str(LSVC_model.score(X_train, y_train)) + "%")
print("LinearSVC test accuracy: " + str(LSVC_model.score(X_valid, y_valid)) + "%")

# Saving LSVC model to later use
#file = open('LSVC_model.pickle','wb')
#pickle.dump(LSVC_model, file)
#file.close()

# Training Naive BernoulliNB model
BNB_model = BernoulliNB()
BNB_model.fit(X_train, y_train)
print("BernoulliNB training accuracy: " + str(BNB_model.score(X_train, y_train)) + "%")
print("BernoulliNB test accuracy: " + str(BNB_model.score(X_valid, y_valid)) + "%")

# Training Logistics Regression model
LR_model = LogisticRegression()
LR_model.fit(X_train, y_train)
print("Logistic Regression training accuracy: " + str(LR_model.score(X_train, y_train)) + "%")
print("Logistic Regression test accuracy: " + str(LR_model.score(X_valid, y_valid)) + "%")

Naive Bayes training accuracy: 0.8181337981863718%
Naive Bayes test accuracy: 0.7850426732243599%
LinearSVC training accuracy: 0.866339165427091%
LinearSVC test accuracy: 0.8047880804281788%
BernoulliNB training accuracy: 0.8127639273262866%
BernoulliNB test accuracy: 0.7841024157384637%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression training accuracy: 0.8149133368596703%
Logistic Regression test accuracy: 0.8009547229856792%


In [10]:
# Sentiment graph
def draw_graph(df, topic):
    
    del df['topic_name']
    
    sentiments = df.groupby('sentiment').size()
    
    graph = df.groupby('sentiment').count().rename(columns={'comment_text': ''}).plot(
    autopct= lambda p: '{:.1f}%({:.0f})'.format(p,(p/100)*sentiments.sum()),
    startangle=90, 
    legend = False,
    shadow=False,
    figsize=(6, 6), 
    fontsize=11,
    kind='pie', 
    colors = ['#c90e0e', '#b8b8b8', '#2dbd51'],
    title=topic,
    subplots=True)
    
    fig = plt.gcf()
    fig.savefig("outputs/" + topic + ".png")
    
    plt.close()

In [11]:
s = sched.scheduler(time.time, time.sleep)
pd.options.mode.chained_assignment = None

def send_sentiment_result(sc): 
    
    # Load not labeled test dataset

    session = ftplib.FTP('ftp.smartdiscussionplatform.com','file@smartdiscussionplatform.com','1YHDQ@i9XUl;')
    file = open('comments.csv','wb')                                             # name the file
    session.retrbinary('RETR public_html/matrixes/comments.csv', file.write)     # read the file
    file.close()                                                                        # close file and FTP
    session.quit()
    
    # Read data
    not_labeled_test_ds = pd.read_csv("comments.csv")
    original = not_labeled_test_ds
    
    # Remove column
    column_list = ["topic_name","comment_text"]
    not_labeled_test_ds = not_labeled_test_ds[column_list]
    
    # Creating not labeled test data
    not_labeled_test_ds.comment_text = not_labeled_test_ds["comment_text"].apply(preprocess_tweet_text)
    
    # Features
    not_labeled_test_feature = tf_vector.transform(np.array(not_labeled_test_ds.iloc[:, 1]).ravel())
    
    # Predict
    test_prediction_lsvc = LSVC_model.predict(not_labeled_test_feature)

    # Transform the integer labels to string labels
    test_results = []
    for test in test_prediction_lsvc:
        test_results.append(int_to_string(test))
    
    # Append results to sentiment column
    not_labeled_test_ds['sentiment'] = test_results
    
    # Get topic names
    dfgroup = not_labeled_test_ds.groupby('topic_name')

    topics = []

    for topic,_ in dfgroup:
        temp = re.sub(r'[^\w\s]','',topic)
        topics.append(temp)
        if temp != topic:
            not_labeled_test_ds['topic_name'].loc[not_labeled_test_ds['topic_name'] == topic] = temp
            
    # Store the graphs on the server
    for topic in topics:
        
        temp_df = not_labeled_test_ds.loc[not_labeled_test_ds['topic_name'] == topic]
        draw_graph(temp_df, topic)
    
        session = ftplib.FTP('ftp.smartdiscussionplatform.com','file@smartdiscussionplatform.com','1YHDQ@i9XUl;')
        file = open("outputs/" + topic + ".png","rb")                                       # file to send
        session.storbinary("STOR public_html/matrixes/sentiments/" + topic + ".png", file)     # send the file
        file.close()                                                                 # close file and FTP
        session.quit()
    
    
    # Word Cloud

    plt.ioff()

    plt.figure(figsize = (20,20))
    wc = WordCloud(max_words = 1000 , width = 1600 , height = 800,
                   collocations=False).generate(" ".join(list(not_labeled_test_ds['comment_text'])))

    plt.imshow(wc)
    plt.savefig("wordcloud.png", format="PNG", dpi=50, bbox_inches='tight')

    plt.close()
    
    
    session = ftplib.FTP('ftp.smartdiscussionplatform.com','file@smartdiscussionplatform.com','1YHDQ@i9XUl;')
    file = open("wordcloud.png","rb")                                       # file to send
    session.storbinary("STOR public_html/matrixes/wordclouds.png", file)     # send the file
    file.close()                                                                 # close file and FTP
    session.quit()
    
    
    s.enter(10, 1, send_sentiment_result, (sc,))

s.enter(10, 1, send_sentiment_result, (s,))
s.run()

KeyboardInterrupt: 