In [1]:
import string 
import cufflinks as cf
import seaborn as sns
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
%matplotlib inline
init_notebook_mode(connected=True)
cf.go_offline()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import tree
import warnings
warnings.filterwarnings("ignore")
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

ModuleNotFoundError: No module named 'wordcloud'

## Importing dataset


In [None]:
# Load the raw dataset from the CSV next to the notebook and preview the first rows.
csv_path = "data.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Distinct values present in the label column.
df['label'].unique()

In [None]:
# Number of rows for each label value.
df['label'].value_counts()

In [None]:
# Summary statistics for the label column.
df['label'].describe()

## Removing stop words

In [None]:
# Print scikit-learn's built-in English stop-word collection for reference.
from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)

In [None]:
# Hand-maintained stop-word list used by remove_stopwords.
# NOTE(review): common function words such as 'not', 'for' and 'of' are absent
# from this list — confirm the omissions are intentional.
stopwords = ['a', 'about', 'above', 'across', 'after', 'again', 'all', 'almost', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'being', 'below', 'beside', 'besides', 'between', 'bill', 'both', 'but', 'by', 'call', 'can', 'co', 'con','de', 'describe', 'detail', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere',  'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'into', 'is', 'it', 'its', 'itself', 'ltd',  'nine', 'no', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'there', 'thereafter',  'therein', 'thereupon', 'these', 'they', 'thick', 'third', 'this', 'those', 'three', 'through', 'thru', 'to','too', 'twelve', 'twenty', 'two', 'un', 'up', 'upon', 'us', 'via', 'was', 'we',  'were',  'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whole', 'whom', 'whose', 'will', 'with', 'within', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves']

In [None]:
# Total number of whitespace-separated words across all tweets.
token_counts = df['tweet'].apply(lambda tweet: len(tweet.split()))
token_counts.sum()

In [None]:
# Report the frame's row and column counts in a readable sentence.
print(f'Input data has {len(df)} rows, {len(df.columns)} columns')

## Removing punctuation

In [None]:
# The punctuation characters that remove_punctuation strips from each tweet.
string.punctuation

In [None]:
def remove_punctuation(txt):
    """Return ``txt`` with every character found in ``string.punctuation`` removed.

    Characters are deleted, not replaced, so words separated only by
    punctuation end up joined together.
    """
    kept_chars = []
    for char in txt:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [None]:
# Strip punctuation from every tweet and keep the result in its own column.
df['msg_clean'] = df['tweet'].apply(remove_punctuation)
df.head()

## Tokenization

In [None]:
def tokenize(txt):
    """Split text into word tokens on runs of non-word characters.

    Parameters
    ----------
    txt : str
        Text to split. Case is left untouched.

    Returns
    -------
    list of str
        The non-empty tokens in order of appearance. Empty or
        separator-only input yields an empty list.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    # re.split yields '' when the text is empty or starts/ends with a
    # separator; those empty tokens carry no information, so drop them.
    return [token for token in re.split(r'\W+', txt) if token]

# Lower-case each punctuation-free message, then split it into tokens.
df['msg_clean_tokenized'] = df['msg_clean'].str.lower().apply(tokenize)

df.head()

In [None]:
def remove_stopwords(txt_tokenized):
    """Return the tokens of ``txt_tokenized`` that are not in the global ``stopwords`` list.

    Token order is preserved and the result is always a new list.
    """
    return list(filter(lambda token: token not in stopwords, txt_tokenized))

# Filter the stop words out of every token list.
df['msz_no_stopwords'] = df['msg_clean_tokenized'].apply(remove_stopwords)
df.head()

## Stemming

In [None]:
# One Porter stemmer instance shared by every call to stemming().
ps = PorterStemmer()


def stemming(tokenized_txt):
    """Return a list with each token of ``tokenized_txt`` passed through ``ps.stem``, in order."""
    return list(map(ps.stem, tokenized_txt))


In [None]:
# Stem the stop-word-free tokens of every row.
df['msg_stemmed'] = df['msz_no_stopwords'].apply(stemming)
df.head()

## Lemmatization

In [None]:
# One WordNet lemmatizer instance shared by every call to lemmatization().
# NOTE(review): presumably needs the NLTK 'wordnet' corpus installed — confirm.
wn = nltk.WordNetLemmatizer()


def lemmatization(token_txt):
    """Return a list with each token of ``token_txt`` passed through ``wn.lemmatize``, in order."""
    lemmas = []
    for token in token_txt:
        lemmas.append(wn.lemmatize(token))
    return lemmas

In [None]:
# Lemmatize the stop-word-free tokens of every row.
df['msg_lemmatized'] = df['msz_no_stopwords'].apply(lemmatization)
df.head()

In [None]:
# Glue each row's lemmatized tokens back into one space-separated string.
df['sentence'] = df['msg_lemmatized'].str.join(' ')
df.head()

## Data visualization

In [None]:
# Bar chart of how many rows carry each label.
sns.countplot(data=df, x='label')

In [None]:
# Interactive plot of the whole frame.
# NOTE(review): by this point df also holds raw text and token-list columns,
# so plotting every column is probably not meaningful — confirm the intent.
df.iplot()

In [None]:
# Interactive bar chart of the non-null value count of each column.
df.count().iplot(kind='bar')

## Linear SVC

In [None]:
# Features are the cleaned sentences, targets are the labels.
X, y = df['sentence'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit the TF-IDF vectorizer on the training sentences only and vectorize them.
vectorizer = TfidfVectorizer()
X_train_tfvect = vectorizer.fit_transform(X_train)
X_train_tfvect.shape

In [None]:
# Train a linear SVM directly on the TF-IDF training matrix.
clf = LinearSVC().fit(X_train_tfvect, y_train)
clf

In [None]:
# Vectorizer and classifier in one pipeline, so raw sentences can be fed
# straight into fit/predict.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)
text_clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test sentences with the fitted pipeline.
predictions=text_clf.predict(X_test)

In [None]:
# Confusion matrix of the test predictions (rows: true labels, columns: predicted labels).
# `metrics` is already imported in the notebook's first cell, so no import is needed here.
print(metrics.confusion_matrix(y_test,predictions))

In [None]:
# Text report of the per-label classification metrics on the test split.
print(metrics.classification_report(y_test,predictions))

## Multinomial Naive Bayes

In [None]:
# Train a multinomial Naive Bayes model directly on the TF-IDF training matrix.
nb_model = MultinomialNB().fit(X_train_tfvect, y_train)
nb_model

In [None]:
# Same pipeline shape with Naive Bayes as the classifier.
# This rebinds `text_clf`, replacing the LinearSVC pipeline built earlier.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('nb_model', MultinomialNB()),
    ]
)
text_clf.fit(X_train, y_train)

In [None]:
# Predict test labels with whatever pipeline `text_clf` currently refers to
# (the Naive Bayes one when cells run in order); overwrites the earlier `predictions`.
predictions=text_clf.predict(X_test)

In [None]:
# Confusion matrix of the test predictions (rows: true labels, columns: predicted labels).
# `metrics` is already imported in the notebook's first cell, so no import is needed here.
print(metrics.confusion_matrix(y_test,predictions))

In [None]:
# Text report of the per-label classification metrics on the test split.
print(metrics.classification_report(y_test,predictions))