In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score,KFold,GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load only the first 3000 rows: `nrows` stops parsing there instead of
# reading the whole CSV and slicing most of it away afterwards.
df = pd.read_csv("classified_data.csv", encoding='utf-8', nrows=3000)
df.shape

(3000, 3)

### Function to remove punctuation

In [3]:
def removePunch(s):
    """Return ``s`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order.
    """
    return ''.join(ch for ch in s if ch not in string.punctuation)

In [4]:
# Display the set of characters that removePunch filters out.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Function to remove stopwords

In [5]:
def removeStopWords(s):
    """Drop English stop words from a whitespace-separated string.

    Parameters
    ----------
    s : str
        Text to filter; it is split on whitespace and compared word by word
        (case-sensitively) against NLTK's English stop-word list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Fetch the stop-word list once per call and keep it in a set: the
    # original re-read the list and scanned it linearly for every word.
    stop_words = set(stopwords.words("english"))
    return ' '.join(w for w in s.split() if w not in stop_words)

In [6]:
# Display NLTK's English stop-word list, the list removeStopWords filters against.
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Function to perform Lemmatizing...
### Lemmatizing has an advantage over stemming: a stemmer can reduce words to meaningless stems

In [7]:
def Lemmatizing(s):
    """Lemmatize each whitespace-separated word of ``s`` with WordNetLemmatizer.

    Returns the lemmatized words joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(token) for token in s.split())

In [8]:
def Stemming(s):
    """Stem each whitespace-separated word of ``s`` with PorterStemmer.

    Returns the stemmed words joined by single spaces.
    """
    porter = PorterStemmer()
    return ' '.join(map(porter.stem, s.split()))

## Make Pipeline

In [9]:
def cleanText(x):
    """Normalize one raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; drop a leading "rt"; collapse runs of dots;
    strip trailing whitespace and one trailing quote; collapse repeated
    whitespace; replace URLs with "url" and @mentions with "user"; replace
    emoticons with the words "positive" / "negative"; remove stop words,
    remove punctuation and stem every remaining word.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    x = x.lower()
    x = re.sub(r'^rt', '', x)        # Remove the leading RT (retweet) marker
    x = re.sub(r'\.\.+', ' ', x)     # Replace two or more dots with a space
    x = re.sub(r'\s+$', '', x)       # Remove whitespace from the end of the string
    x = re.sub(r"""['"]$""", '', x)  # Remove one trailing " or '
    x = re.sub(r'\s\s+', ' ', x)     # Collapse two or more whitespace characters
    x = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', 'url', x)  # Replace URLs with the word url
    x = re.sub(r'@[\S]+', 'user', x)                          # Replace @mentions with the word user

    # Happy emoticons: :) : ) :-) (: ( : (-: :')
    x = re.sub(r"(:\s?\)|:-\)|\(\s?:|\(-:|:'\))", 'positive', x)
    # Laughing emoticons :D : D :-D xD x-D. The text is already lowercased, so
    # match a lowercase "d" (the original uppercase pattern could never match);
    # the word boundaries keep ordinary words containing "xd" or ":d..." intact.
    x = re.sub(r':\s?d\b|:-d\b|\bx-?d\b', 'positive', x)
    # Love / kiss emoticons: <3 :*
    x = re.sub(r'<3|:\*', 'positive', x)
    # Sad emoticons :( : ( :-( ): ) : )-:  -- the original mapped these to
    # 'positive', which also made the later 'negative' substitution unreachable.
    x = re.sub(r':\s?\(|:-\(|\)\s?:|\)-:', 'negative', x)
    # Crying emoticons: :,( :'( :"(
    x = re.sub(r'''(:,\(|:'\(|:"\()''', 'negative', x)

    x = removeStopWords(x)
    x = removePunch(x)
    x = Stemming(x)
    # Not applied (ideas left disabled in the original): lemmatizing instead of
    # stemming, and masking e-mail addresses, money symbols, phone numbers,
    # plain numbers, repeated words and single characters.
    return x

In [10]:
# Try cleanText on a sample tweet that contains a leading RT, dot runs, an emoticon, a URL, an @mention and punctuation.
cleanText("RT ..Iam as.... I am positive :)* I am wearing t-shirt I am sooo smart. i like play playing player I own https://www.programiz.com/python-programming/regex @abccdd  'abc'<<//#abc'")

'iam posit posit wear tshirt sooo smart like play play player url user abcabc'

### Function for CountVectorizer

In [74]:
def countervectorized(df):
    """Fit a ``CountVectorizer(min_df=10)`` on ``df`` and return the result as an array.

    ``df`` is the collection of text documents handed to ``fit_transform``.
    """
    vectorizer = CountVectorizer(min_df=10)
    return vectorizer.fit_transform(df).toarray()

### Function for TFIDFvectorizer

In [75]:
def TFIDFVectorized(df):
    """Fit a ``TfidfVectorizer(min_df=10)`` on ``df`` and return the result as an array.

    ``df`` is the collection of text documents handed to ``fit_transform``.
    """
    vectorizer = TfidfVectorizer(min_df=10)
    return vectorizer.fit_transform(df).toarray()

In [76]:

# Run cleanText over every entry of the SentimentText column and preview the result.
features = df["SentimentText"].apply(cleanText)
features.head()

0                                       sad apl friend
1                                miss new moon trailer
2                                    omg alreadi 730 o
3    omgaga im sooo im gunna cri ive dentist sinc 1...
4                              think mi bf cheat me tt
Name: SentimentText, dtype: object

In [77]:
# Pull the Sentiment column out as a plain array of labels.
target = df['Sentiment'].values
target

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

### Count vectorizing and TF-IDF vectorizing the sentiment text

In [78]:
# Build both representations of the cleaned text; only the TF-IDF matrix is displayed here.
feature_CV=countervectorized(features)
feature_TFIDF=TFIDFVectorized(features)
feature_TFIDF

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [79]:
# Hold out 20% of the TF-IDF rows for testing; random_state=10 fixes the split.
x_train,x_test,y_train,y_test = train_test_split(feature_TFIDF,target, test_size = 0.20,random_state=10)

In [80]:
def cm(y_test,pred):
    """Print the confusion matrix computed from ``y_test`` and ``pred``."""
    print(confusion_matrix(y_test, pred))

### Grid search = Parameter tuning + Cross validation

In [81]:
# Shuffled 5-fold splitter. The fixed seed (same value as the train/test split)
# makes the folds, and therefore the scores and the selected hyper-parameters,
# repeatable across kernel restarts; the original shuffled with no seed.
kf = KFold(n_splits=5, shuffle=True, random_state=10)
# Baseline: cross-validated accuracy of a linear-kernel SVC on the full TF-IDF matrix.
svc_score = cross_val_score(SVC(kernel='linear'), feature_TFIDF, target, cv=kf, scoring='accuracy')
# Search grid in powers of ten: C from 1e-3 to 1e2, gamma from 1e-5 to 1e-1.
para_grid = {'C': 10. ** np.arange(-3, 3),
             'gamma': 10. ** np.arange(-5, 0)}
grid_search = GridSearchCV(SVC(), para_grid, cv=kf)

In [82]:
# Run the grid search on the training split (the recorded run below was interrupted).
grid_search.fit(x_train,y_train)

KeyboardInterrupt: 

In [None]:
# Show the parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Score the fitted grid search on the held-out test split.
grid_search.score(x_test,y_test)

In [None]:
# Keep the best estimator found by the grid search for saving below.
clf=grid_search.best_estimator_

In [None]:
import pickle   # pickle lets us save the trained model so that we don't have to retrain it on every run.

In [None]:
filename = 'twitter_model.pkl'
# The context manager closes the file even if pickling raises,
# which the manual open()/close() pair did not guarantee.
with open(filename, 'wb') as f:
    pickle.dump(clf, f)

In [None]:
filename = 'twitter_model.pkl'
# NOTE: unpickling can execute arbitrary code; only load model files you created yourself.
# The context manager closes the handle that the original left open.
with open(filename, 'rb') as f:
    clf2 = pickle.load(f)