In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator,TransformerMixin
from collections import defaultdict
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import resample
from sklearn.base import BaseEstimator,TransformerMixin
import math
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Phrases
stopwords = set(stopwords.words('english'))

In [2]:
df = pd.read_csv("full-corpus-training.csv")

In [3]:
df = df[df["Sentiment"] != "irrelevant"]

In [4]:
df.reset_index(inplace = True, drop = True)

In [5]:
df.groupby("Sentiment").count()

Unnamed: 0_level_0,TweetId,TweetText
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,437,437
neutral,2228,2228
positive,329,329


In [6]:
def add_colos(data): # for each docment we will add collocaions of biagrams and triagrams. Like if we have red and wine frequently appeqaring together, we will add red_wine as single word to the document
    text_clean= [data]
    
    bigram = Phrases(text_clean)
    trigram = Phrases(bigram[text_clean])

    for idx in range(len(text_clean)):
        for token in bigram[text_clean[idx]]:
            if '_' in token:
                text_clean[idx].append(token)
        for token in trigram[text_clean[idx]]:           
            if '_' in token:
                text_clean[idx].append(token)
                
    return text_clean[0]

                

In [7]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [8]:
lemmatizer = WordNetLemmatizer()

def text_preprocess(raw): # biagrmas?
    raw = re.sub(r"http://(?:[^/]+/)*.*","",raw).strip()
    raw = re.sub(r"\d+","",raw).strip()
    new = ""
    for i in raw:
        new = new+i if i not in string.punctuation else new
        
    new = list(map(lambda x : x.lower(), new.split()))
    
    new = [word for word in new if word not in stopwords]
    new = [word for word in new if word]
    
    new = add_colos(new)
    
    lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in new]
    
    return lemmas

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df[["TweetText"]], df["Sentiment"], test_size=0.15, random_state=42)

In [10]:
t = pd.concat([X_train,y_train],axis = 1).reset_index(drop = True)

In [11]:
t.groupby("Sentiment").count()

Unnamed: 0_level_0,TweetText
Sentiment,Unnamed: 1_level_1
negative,364
neutral,1896
positive,284


In [12]:
not_neutral = t[t.Sentiment!="neutral"]
neutral = t[t.Sentiment=="neutral"]

In [13]:
neutral_upsampled = resample(not_neutral,
                          replace=True, # sample with replacement
                          n_samples=len(neutral), # match number in majority class
                          random_state=27) # reproducible results


upsampled = pd.concat([neutral_upsampled, neutral])

In [14]:
neutral_downsampled = resample(neutral,
                                replace = False, # sample without replacement
                                n_samples = len(not_neutral), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([neutral_downsampled, not_neutral])

In [15]:
upsampled.groupby("Sentiment").count()

Unnamed: 0_level_0,TweetText
Sentiment,Unnamed: 1_level_1
negative,1057
neutral,1896
positive,839


In [16]:
downsampled.groupby("Sentiment").count()

Unnamed: 0_level_0,TweetText
Sentiment,Unnamed: 1_level_1
negative,364
neutral,648
positive,284


In [359]:
class multiple_classifiers(BaseEstimator,TransformerMixin):
    
    def __init__(self, clasis):
        self.pipes = clasis
        
        
    def fit(self,classes,x):
        for i in range(len(classes)-1):
            for j in range(i+1, len(classes)):
                one = x[x['Sentiment'] == classes[i]]
                two = x[x['Sentiment'] == classes[j]]
                new = pd.concat([one,two])
                new.reset_index(inplace = True)
                self.pipes[i+j-1].fit(new["TweetText"],new["Sentiment"]) 
        return 
        
    def predict(self,matrix):
        preds = []
        for i in range(len(matrix)):
            row = matrix.loc[[i], ["TweetText"]]
            cur = []        
            for pipe in self.pipes:
                cur.append(pipe.predict(row)[0])
            #print(cur)
            champion =  sorted(cur, key = lambda y : cur.count(y),reverse = True)[0]
            preds.append(champion)
        #print("Done predicting")
        return preds
    
    def __repr__(self):
        return "something"
    def __str__(self):
        return f"model: {self.pipes[0].steps[-1][0]}"

In [20]:
def see_t(actual, preds):
    scores = accuracy_score(y_test,preds)
    scores1 = precision_score(y_test,preds,average="weighted")
    scores2 =  recall_score(y_test,preds,average="weighted")
    scores3 =  f1_score(y_test,preds,average="weighted")
    print(scores,scores1,scores2,scores3)
    return 

In [361]:
m = len(set((df["Sentiment"])))
n = 2

In [362]:
number_of_classifiers = int(math.factorial(m)/(2* math.factorial(m-n)))

In [363]:
def create_pipes(n,choice = "log"):
    res= []
    for i in range(n):
        if choice == "log":
            res.append(Pipeline([("vectorizer", TfidfVectorizer(tokenizer=lambda x : text_preprocess(x))), ("LogisticRegression",  LogisticRegression())]))
        elif choice == "sgd":
            res.append(Pipeline([("vectorizer", TfidfVectorizer(tokenizer=lambda x : text_preprocess(x))), ("SGDClassifier",  SGDClassifier())]))
        elif choice == "svm":
            res.append(Pipeline([("vectorizer", TfidfVectorizer(tokenizer=lambda x : text_preprocess(x))), ("svm.SVC",  svm.SVC(kernel='rbf',random_state=0))]))
        else:
            res.append(Pipeline([("vectorizer", TfidfVectorizer(tokenizer=lambda x : text_preprocess(x))), ("random forest", RandomForestClassifier())]))
    return res


In [364]:
p1 = create_pipes(number_of_classifiers)
p2 = create_pipes(number_of_classifiers,"sgd")
p3 = create_pipes(number_of_classifiers,"svm")
p4 = create_pipes(number_of_classifiers,"forest")

In [365]:
pipes1 = multiple_classifiers(p1)
pipes2 = multiple_classifiers(p2)
pipes3 = multiple_classifiers(p3)
pipes4 = multiple_classifiers(p4)

lst = [pipes1,pipes2,pipes3,pipes4]

In [366]:
def train_analyse(pipes,x,x_test,y_test):
    for pipe in pipes:
        pipe.fit(list(set(x["Sentiment"])),x)
        
    for pipe in pipes:
        preds = pipe.predict(x_test.reset_index(drop=True))
        print(pipe)
        see_t(y_test,preds)
        print()
    return
    
    

In [367]:
train_analyse(lst,downsampled,X_test,y_test)

model: LogisticRegression
0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173



  _warn_prf(average, modifier, msg_start, len(result))


model: SGDClassifier
0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173



  _warn_prf(average, modifier, msg_start, len(result))


model: svm.SVC
0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173



  _warn_prf(average, modifier, msg_start, len(result))


model: random forest
0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173



  _warn_prf(average, modifier, msg_start, len(result))


In [368]:
train_analyse(lst,upsampled,X_test,y_test)

model: LogisticRegression
0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173



  _warn_prf(average, modifier, msg_start, len(result))


model: SGDClassifier
0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173



  _warn_prf(average, modifier, msg_start, len(result))


model: svm.SVC
0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173



  _warn_prf(average, modifier, msg_start, len(result))


model: random forest
0.1 0.01 0.1 0.01818181818181818



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
class to_sparse(BaseEstimator,TransformerMixin):
    def __init__(self,vec):
        self.vec = vec
    
    def fit(self,X,y=None):
        self.vec.fit(X)
        return self
    
    def transform(self,X,y=None):
        return self.vec.transform(X).toarray()

In [407]:
ct = to_sparse(TfidfVectorizer(tokenizer=lambda x : text_preprocess(x)))

baes = Pipeline([("vectorizer", ct), ("GaussianNB",  GaussianNB())])

baes.fit(downsampled["TweetText"],downsampled["Sentiment"])

preds = baes.predict(X_test["TweetText"])

see_t(y_test,preds)



0.5222222222222223 0.6947611921037475 0.5222222222222223 0.5654530635531919


In [499]:
ct = to_sparse(TfidfVectorizer(tokenizer=lambda x : text_preprocess(x)))

baes = Pipeline([("vectorizer", ct), ("GaussianNB",  GaussianNB())])

baes.fit(X_train["TweetText"],y_train)

preds = baes.predict(X_test["TweetText"])

see_t(y_test,preds)



0.62 0.6676357886809434 0.62 0.6392088664219812


In [21]:
ct = to_sparse(TfidfVectorizer(tokenizer=lambda x : text_preprocess(x)))
forest = Pipeline([("vectorizer", ct), ("GaussianNB",  RandomForestClassifier())])
forest.fit(X_train["TweetText"],y_train)
preds3 = forest.predict(X_test["TweetText"])

see_t(y_test,preds3)

0.7933333333333333 0.7905072254208622 0.7933333333333333 0.7517478428004744


In [522]:
pipes1.fit(list(set(t["Sentiment"])),t)
preds1 = pipes1.predict(X_test.reset_index(drop=True))
see_t(y_test,preds1)

0.7377777777777778 0.544316049382716 0.7377777777777778 0.6264506962205173


  _warn_prf(average, modifier, msg_start, len(result))


In [414]:
to_predict = pd.read_excel("testing_data.xlsx",header=None,names = ["id","TweetText"])

In [500]:
result = baes.predict(to_predict["TweetText"])
result_df = pd.DataFrame({"Tweets":result})
result_df.to_excel("C://Users/La_Admin/Desktop/out_w3.xlsx",encoding="UTF-8")  

In [523]:
result1 = pipes1.predict(to_predict[["TweetText"]])
result_df1 = pd.DataFrame({"Tweets":result1})

In [524]:
result[50]

'negative'

In [528]:
result1[70]

'neutral'

In [526]:
to_predict["TweetText"][50]

'New macbook is too sick @apple'