In [1]:
dataset='output1.csv'

In [41]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#printing the stopwords in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
#data processing
#loading the data from csv to pandas dataframe
twitter_data = pd.read_csv('output1.csv', encoding='ISO-8859-1')

In [10]:
#checking the number of rows and columns
twitter_data.shape

(5999, 6)

In [11]:
#printing the first five rows of the dataframe
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,4,2187157363,Mon Jun 15 19:52:26 PDT 2009,NO_QUERY,TidyCat,@firedancertat aight - good night twitter - i ...
1,4,1981086700,Sun May 31 08:23:08 PDT 2009,NO_QUERY,iiiccchhhaaa,@LeonnieFM you're very welcome
2,4,2001780956,Tue Jun 02 02:03:03 PDT 2009,NO_QUERY,myatsang,Am watching old hk tv seriesä»æ?¥èªæ±æ¹. M...
3,0,1557914489,Sun Apr 19 06:33:09 PDT 2009,NO_QUERY,Scribbsc,http://twitpic.com/3ldd4 - the last thing I re...
4,0,2192831319,Tue Jun 16 07:36:48 PDT 2009,NO_QUERY,jameane,Really irritated about these phantom updates n...


In [17]:
#naming the columns and reading the dataset again
column_names=['target','id','date','flag','user','text']
twitter_data = pd.read_csv('output1.csv', names=column_names, encoding='ISO-8859-1')

In [18]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,4,2187157363,Mon Jun 15 19:52:26 PDT 2009,NO_QUERY,TidyCat,@firedancertat aight - good night twitter - i ...
2,4,1981086700,Sun May 31 08:23:08 PDT 2009,NO_QUERY,iiiccchhhaaa,@LeonnieFM you're very welcome
3,4,2001780956,Tue Jun 02 02:03:03 PDT 2009,NO_QUERY,myatsang,Am watching old hk tv seriesä»æ?¥èªæ±æ¹. M...
4,0,1557914489,Sun Apr 19 06:33:09 PDT 2009,NO_QUERY,Scribbsc,http://twitpic.com/3ldd4 - the last thing I re...


In [19]:
#one extra column since we added a column
twitter_data.shape

(6000, 6)

In [20]:
#counting the number of missing values in the dataset
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [21]:
#checking the distribution of traget column
twitter_data['target'].value_counts()

4    3018
0    2982
Name: target, dtype: int64

In [22]:
#converting 4 into 1
twitter_data.replace({'target':{4:1}}, inplace=True)

In [23]:
twitter_data['target'].value_counts()

1    3018
0    2982
Name: target, dtype: int64

In [24]:
#0==>negative tweet and 1==> positive tweet

In [25]:
#stemming
port_stem = PorterStemmer()

In [26]:
def stemming(content):
    
    stemmed_content = re.sub('[^a-zA-z]',' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    
    return stemmed_content

In [27]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [28]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,1,2187157363,Mon Jun 15 19:52:26 PDT 2009,NO_QUERY,TidyCat,@firedancertat aight - good night twitter - i ...,firedancertat aight good night twitter think s...
2,1,1981086700,Sun May 31 08:23:08 PDT 2009,NO_QUERY,iiiccchhhaaa,@LeonnieFM you're very welcome,leonniefm welcom
3,1,2001780956,Tue Jun 02 02:03:03 PDT 2009,NO_QUERY,myatsang,Am watching old hk tv seriesä»æ?¥èªæ±æ¹. M...,watch old hk tv seri mind numb fun
4,0,1557914489,Sun Apr 19 06:33:09 PDT 2009,NO_QUERY,Scribbsc,http://twitpic.com/3ldd4 - the last thing I re...,http twitpic com ldd last thing rememb


In [29]:
print(twitter_data['stemmed_content'])

0       switchfoot http twitpic com zl awww bummer sho...
1       firedancertat aight good night twitter think s...
2                                        leonniefm welcom
3                      watch old hk tv seri mind numb fun
4                  http twitpic com ldd last thing rememb
                              ...                        
5995                                              go work
5996                                leav soon readi cruis
5997                                        mommi nooooob
5998    markhoppu hey mark hope well wonder know go an...
5999            degrafik one thing learnt sinus pain nose
Name: stemmed_content, Length: 6000, dtype: object


In [30]:
print(twitter_data['target'])

0       0
1       1
2       1
3       1
4       0
       ..
5995    0
5996    1
5997    1
5998    1
5999    0
Name: target, Length: 6000, dtype: int64


In [34]:
#separating the data and label
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [35]:
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'firedancertat aight good night twitter think surviv monday see tomorrow ^ ^'
 'leonniefm welcom' ... 'mommi nooooob'
 'markhoppu hey mark hope well wonder know go announc blink tour date xx'
 'degrafik one thing learnt sinus pain nose']


In [36]:
print(Y)

[0 1 1 ... 1 1 0]


In [37]:
#spliiting the data into training data and test data
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [39]:
print(X.shape,X_train.shape, X_test.shape)

(6000,) (4800,) (1200,)


In [42]:
#converting the actual data to numerical data

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [43]:
print(X_train)

  (0, 2495)	0.2840983562368176
  (0, 1616)	0.27060308387855875
  (0, 4421)	0.24857206263476367
  (0, 3413)	0.3722198860411007
  (0, 9226)	0.35503728922720496
  (0, 47)	0.27681577231507737
  (0, 8900)	0.32566344595911056
  (0, 6214)	0.27060308387855875
  (0, 8365)	0.3722198860411007
  (0, 5595)	0.19906643296771465
  (0, 4696)	0.17077462715465366
  (0, 7793)	0.2375419161548275
  (1, 5762)	0.2557619861534069
  (1, 4901)	0.1644605224095848
  (1, 875)	0.14834667279576166
  (1, 3749)	0.1209464765932917
  (1, 1764)	0.243955375993116
  (1, 6644)	0.243955375993116
  (1, 6357)	0.2290808063221701
  (1, 309)	0.13558526616210123
  (1, 5591)	0.22377184261151387
  (1, 1781)	0.4581616126443402
  (1, 4979)	0.17891611098845012
  (1, 8574)	0.2557619861534069
  (1, 7077)	0.21928317775165435
  :	:
  (4795, 4872)	0.40982419676588316
  (4795, 5803)	0.3174701738276392
  (4795, 3254)	0.2614510798614556
  (4796, 8380)	0.5741949699805639
  (4796, 3204)	0.38001153676208005
  (4796, 3595)	0.36508969699283134
  (47

In [44]:
print(X_test)

  (0, 7479)	0.3509771121660947
  (0, 7473)	0.3093184293564991
  (0, 5973)	0.3994890777350321
  (0, 5067)	0.3736282297344963
  (0, 3782)	0.3604045751301241
  (0, 3485)	0.35872892365261977
  (0, 3288)	0.26197545156492036
  (0, 504)	0.3933682283173373
  (1, 6193)	0.5805031257323886
  (1, 4578)	0.41435474337321726
  (1, 4377)	0.3928643387659011
  (1, 666)	0.5805031257323886
  (2, 9031)	0.4728575676986458
  (2, 7793)	0.4179955827922301
  (2, 5816)	0.4761723555242701
  (2, 4949)	0.3454395757588064
  (2, 4750)	0.43541634677987906
  (2, 3233)	0.25696187601901693
  (3, 8998)	0.20705449142576843
  (3, 8090)	0.2320288938462945
  (3, 7072)	0.37966019124182027
  (3, 6579)	0.29835478035174195
  (3, 5556)	0.23349543024120128
  (3, 3656)	0.40378325999671416
  (3, 3644)	0.4300015291271975
  :	:
  (1195, 6558)	0.37896513472512716
  (1195, 3727)	0.2743216022077737
  (1195, 2519)	0.3213387542185608
  (1195, 141)	0.3287162664550248
  (1196, 8757)	0.21451732816523278
  (1196, 7198)	0.30522414591126185
  (11

In [45]:
#training the ML model using logistic regression model

model = LogisticRegression(max_iter=1000)

In [46]:
model.fit(X_train,Y_train)

LogisticRegression(max_iter=1000)

In [48]:
#model evaluation

#accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train,X_train_prediction)

In [49]:
print('Accuracy score on training data :',training_data_accuracy)

Accuracy score on training data : 0.8854166666666666


In [50]:
#accuracy score on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,X_test_prediction)

In [51]:
print('Accuracy score on test data :',test_data_accuracy)

Accuracy score on test data : 0.7208333333333333


In [52]:
#saving the trained model
import pickle
filename = 'trained_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [53]:
#loading the saved model
loadel_model = pickle.load(open('trained_model.sav', 'rb'))

In [56]:
X_new = X_test[3]
print(Y_test[3])

prediction = model.predict(X_new)
print(prediction)

if(prediction[0] == 0):
    print('Negative tweet')
    
else:
    print('Positive tweet')

0
[0]
Negative tweet
