In [4]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle datasets download -d abhi8923shriv/sentiment-analysis-dataset

Dataset URL: https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset
License(s): CC0-1.0
Downloading sentiment-analysis-dataset.zip to /content
 90% 49.0M/54.4M [00:00<00:00, 109MB/s] 
100% 54.4M/54.4M [00:00<00:00, 111MB/s]


In [7]:
from zipfile import ZipFile
dataset = '/content/sentiment-analysis-dataset.zip'
with ZipFile(dataset,'r' ) as zip:
      zip.extractall()
      print( 'The dataset is extracted' )

The dataset is extracted


In [8]:
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',encoding =('ISO-8859-1'),low_memory =False)
df.shape

(1048572, 6)

In [9]:
df.head(10)

Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,0,1467812579,Mon Apr 06 22:20:17 PDT 2009,NO_QUERY,pardonlauren,I just re-pierced my ears
8,0,1467812723,Mon Apr 06 22:20:19 PDT 2009,NO_QUERY,TLeC,@caregiving I couldn't bear to watch it. And ...
9,0,1467812771,Mon Apr 06 22:20:19 PDT 2009,NO_QUERY,robrobbierobert,"@octolinz16 It it counts, idk why I did either..."


<h3>Data preprocessing</h3>

In [10]:
# naming the columns and reading the dataset again
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names=column_names, encoding = 'ISO-8859-1')

  twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names=column_names, encoding = 'ISO-8859-1')


In [11]:
twitter_data = twitter_data.drop(index=0)

In [12]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [13]:
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [14]:
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,668925
4,248576
0,131071


In [15]:
twitter_data.replace({'target':{4:1}}, inplace=True) #to convert Positive Tweet Value to 1

In [16]:
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,668925
1,248576
0,131071


In [17]:
def stemming(content):
    corpus = []
    for i in range(0, 1000):
        review = re.sub('[^a-zA-Z]', ' ',content)
        review = review.lower()
        review = review.split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
        review = ' '.join(review)
        return review

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [20]:
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

## Splitting the dataset into the Training set and Test set

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [22]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()

# Fit and transform on the training data
X_train_transformed = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_transformed = vectorizer.transform(X_test)

In [23]:
X_train

array(['btw anyon want old g go drop recycl',
       'rachaell would want stay fiona hut mum alreadi decid pick',
       'vote opinion susan boyl http tinyurl com susanboylepol', ...,
       'amber x ive got wait anoth month xx',
       'thisgoesher june th excit co ill less time use comput',
       'georgieboo aww thank'], dtype=object)

In [24]:
X_test

array(['exam loom make feel realli tens anxiou wait free week',
       'rain ruin day', 'pound cherri ridicul', ...,
       'thetricktolif oz treat well thank unfortunatli leav hour tho still got singapor go tho',
       'xhardcoreyx', 'candi bag almost empti sad haha'], dtype=object)

In [25]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

## Training the Machine Learning Model

In [26]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_transformed, y_train)


In [27]:
y_pred = model.predict(X_test_transformed)
training_data_accuracy = accuracy_score(y_train[:209715], y_pred)
print( 'Accuracy score on the training data :', training_data_accuracy)

Accuracy score on the training data : 0.6791121283646854


In [31]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train_transformed)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print( 'Accuracy score on the training data :', training_data_accuracy)


Accuracy score on the training data : 0.8575049144252238


In [None]:
from transformers import Trainer
Trainer.save_state()
Trainer.train(resume_from_checkpoint='ال path هنا')


# Saving the trained model

In [32]:
import pickle
filename= 'trained_model.sav'
pickle.dump(model,open(filename,'wb'))

In [33]:
# loading the saved model after train
import pickle
loaded_model = pickle.load(open('/content/trained_model.sav','rb'))
X_new = X_test[200]

In [40]:
X_new = X_test_transformed[:70]
print (y_test[200])
# Reshape to have 70 rows and the correct number of features
prediction = loaded_model.predict(X_new)
print(prediction)
for i in prediction:
  if i == 0:
    print('Negative Tweet')
  else:
    print('Positive Tweet')

0
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Positive Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Positive Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Positive Tweet
Negative Tweet
Negative Tweet
Positive Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Positive Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Positive Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
Negative Tweet
