In [12]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk 
from nltk import pos_tag_sents, word_tokenize, pos_tag
from nltk import word_tokenize
# Load Text Cleaning Pkgs
# !pip install neattext
# !pip install scikit-multilearn
import neattext.functions as nfx
import neattext as nt
import re
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split
import matplotlib
import seaborn as sns

# classifiers
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import LabelPowerset, ClassifierChain, BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")
STOPLIST = set(stopwords.words('english'))

In [2]:
df_train=pd.read_csv("dataset1.csv")
print("Shape Before cleaning:", df_train.shape)
df_train.drop_duplicates(subset=['text'], inplace=True)
df_train[pd.isnull(df_train).any(axis=1)]
df_train = df_train.interpolate()  # Interpolate our data to get rid of null values
print("Shape After cleaning:", df_train.shape)
df_train.head()

Shape Before cleaning: (20000, 10)
Shape After cleaning: (19464, 10)


Unnamed: 0,text,anger,fear,joy,love,sadness,surprise,thankfulness,disgust,guilt
0,What can happen to you if your depressed #depr...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,@user I am for my family s complete lack of in...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,jennifer reyna haha I know ! I was trying to c...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,@user Thanks for helping with @user They upgra...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,No stroller or baby dog headed to hike #freedom,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [3]:
N = 9
df_train['text'] = df_train['text'].astype(str)
y = df_train.iloc[: , -N:]

In [4]:
print ("Noise in every text before cleaning")
df_train['text'].apply(lambda x:nt.TextFrame(x).noise_scan())

Noise in every text before cleaning


0        {'text_noise': 8.080808080808081, 'text_length...
1        {'text_noise': 12.162162162162163, 'text_lengt...
2        {'text_noise': 11.11111111111111, 'text_length...
3        {'text_noise': 11.904761904761903, 'text_lengt...
4        {'text_noise': 6.382978723404255, 'text_length...
                               ...                        
19995    {'text_noise': 14.285714285714285, 'text_lengt...
19996    {'text_noise': 9.803921568627452, 'text_length...
19997    {'text_noise': 3.389830508474576, 'text_length...
19998    {'text_noise': 4.615384615384616, 'text_length...
19999    {'text_noise': 15.909090909090908, 'text_lengt...
Name: text, Length: 19464, dtype: object

In [5]:
corpus = df_train['text'].apply(nfx.remove_stopwords)
corpus = corpus.apply(nfx.remove_punctuations)
corpus = corpus.apply(nfx.remove_userhandles)

print ("Noise after cleaning")
corpus.apply(lambda x:nt.TextFrame(x).noise_scan())

Noise after cleaning


0        {'text_noise': 0, 'text_length': 66, 'noise_co...
1        {'text_noise': 0, 'text_length': 47, 'noise_co...
2        {'text_noise': 0, 'text_length': 76, 'noise_co...
3        {'text_noise': 0, 'text_length': 37, 'noise_co...
4        {'text_noise': 0, 'text_length': 38, 'noise_co...
                               ...                        
19995    {'text_noise': 0, 'text_length': 40, 'noise_co...
19996    {'text_noise': 0, 'text_length': 20, 'noise_co...
19997    {'text_noise': 0, 'text_length': 106, 'noise_c...
19998    {'text_noise': 0, 'text_length': 40, 'noise_co...
19999    {'text_noise': 0, 'text_length': 21, 'noise_co...
Name: text, Length: 19464, dtype: object

In [6]:
tfidf = TfidfVectorizer()
Xfeatures = tfidf.fit_transform(corpus).toarray()
X_train, X_test, y_train, y_test = train_test_split(Xfeatures, y, test_size=0.3, random_state=42)

In [7]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    clf_predictions = clf.predict(xtest)
    acc = accuracy_score(ytest,clf_predictions)
    # Hamming loss to determine the fraction of incorrect predictions of a given model. 
    ham = hamming_loss(ytest,clf_predictions)  
    result = {"accuracy:":acc, "hamming_score":ham}
    return result

## Classification

In [31]:
# Multinomial Naive Bayes Classifier

# Multi-class classifier is trained on all the possible label combinations in our data
mnb_model = build_model(MultinomialNB(), LabelPowerset, X_train, y_train, X_test, y_test)
print("Multinomial Naive Bayes Classifier", mnb_model)

Multinomial Naive Bayes Classifier {'accuracy:': 0.5815068493150685, 'hamming_score': 0.10013318112633181}


In [36]:
# Random Forest Classifier
rf_model = build_model(RandomForestClassifier(n_estimators=9), LabelPowerset, X_train, y_train, X_test, y_test)
print("Random Forest Classifier", rf_model)

Random Forest Classifier {'accuracy:': 0.6217465753424658, 'hamming_score': 0.09084855403348555}


In [32]:
# Logistic regression

lg_model = build_model(LogisticRegression(solver='lbfgs', multi_class='multinomial', penalty='l2', C=1.0, max_iter=200), 
                        LabelPowerset, X_train, y_train, X_test, y_test)
print("Logistic regression Classifier", lg_model)

Logistic regression Classifier {'accuracy:': 0.649486301369863, 'hamming_score': 0.08310502283105023}


In [8]:
# Linear SVM classification

svc_model = build_model(LinearSVC(tol=1e-05), LabelPowerset, X_train, y_train, X_test, y_test)
print("Linear SVC Classifier", svc_model)

Linear SVC Classifier {'accuracy:': 0.675513698630137, 'hamming_score': 0.0764269406392694}


In [10]:
# Making a single prediction from dataset 1
print(y.columns.values)
for i in range(15):
    ex1 = df_train['text'].iloc[i]
    vec_example = tfidf.transform([ex1])
    binary_rel_clf = BinaryRelevance(LinearSVC(tol=1e-05))
    binary_rel_clf.fit(X_train,y_train)
    output = binary_rel_clf.predict(vec_example).toarray()
    print (output)

['anger' 'fear' 'joy' 'love' 'sadness' 'surprise' 'thankfulness' 'disgust'
 'guilt']
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 1. 0. 0. 0. 0.]]
[[0. 0. 0. 1. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 1. 0. 0.]]
[[0. 0. 1. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 1. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 1. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 1. 0. 0.]]
[[0. 1. 1. 1. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]


## Making a single prediction from dataset 2

In [27]:
df_test = pd.read_csv('dataset2.csv', sep='\t', names= ['text'])
df_test.drop_duplicates(subset=['text'], inplace=True)
corpus1 = df_test['text'].apply(nfx.remove_stopwords)
corpus1 = corpus1.apply(nfx.remove_punctuations)
corpus1 = corpus1.apply(nfx.remove_userhandles)

In [31]:
for i in range(15):
    ex1 = corpus1.iloc[i]
    vec_example = tfidf.transform([ex1])
    binary_rel_clf = BinaryRelevance(LinearSVC(tol=1e-05))
    binary_rel_clf.fit(X_test,y_test)
    print(binary_rel_clf.predict(vec_example).toarray())


[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 1. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 1. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.]]
