In [1]:
!pip install kaggle



In [2]:
# Set up the Kaggle credentials by copying kaggle.json into ~/.kaggle.
# kaggle.json must already be present in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600 leaves the credentials file readable/writable by its owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the kazanova/sentiment140 dataset with the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 90% 73.0M/80.9M [00:00<00:00, 103MB/s] 
100% 80.9M/80.9M [00:00<00:00, 93.2MB/s]


In [4]:
#Extracting the CSV dataset from the zip file
from zipfile import ZipFile

dataset = '/content/sentiment140.zip'

# Open the archive read-only and unpack every member into the current working
# directory; the context manager closes the archive afterwards.
with ZipFile(dataset, "r") as zipper:
  zipper.extractall()
  print("The dataset has been extracted")

The dataset has been extracted


In [18]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [6]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later on.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
#loading data from CSV to panda dataframe
# Column names are passed explicitly via `names`.
column_names = ['Target','Id','Date','Flag','User','Text']
twitter_data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding="ISO-8859-1",
)

In [8]:
#Checking the distribution of the target column
# (counts how many rows carry each distinct Target value)
twitter_data['Target'].value_counts()

Target
0    800000
4    800000
Name: count, dtype: int64

In [9]:
#Replacing the data label 4 to 1
# The mapping has to be a dict ({4: 1}, old value -> new value). The previous
# {4,1} was a set literal, which does not describe a mapping, so label 4 was
# never turned into 1.
twitter_data['Target'] = twitter_data['Target'].replace({4: 1})

In [10]:
# Clean, tokenize, remove stopwords, and stem words to standardize the text.
port_stem = PorterStemmer()
# Read the stop-word list once and keep it as a set: calling
# stopwords.words('english') for every word re-reads the list and scans it
# linearly, which dominates the runtime when this runs over the whole dataset.
english_stopwords = frozenset(stopwords.words('english'))

def stemming(content):
  """Return ``content`` reduced to space-separated stems of its non-stop-words.

  Every character that is not an ASCII letter is replaced by a space, the text
  is lower-cased and split on whitespace, English stop words are dropped, and
  each remaining word is stemmed with ``port_stem``.

  Args:
    content: The text to normalise (a string).

  Returns:
    A single string with the stemmed words joined by one space; an empty
    string when no words remain.
  """
  words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  return " ".join(
      port_stem.stem(word) for word in words if word not in english_stopwords
  )

In [11]:
#Adding the stemmed data into the twitter_data dataframe
# Runs stemming() once per value of the Text column and stores the results in a
# new Stemmed_Data column.
twitter_data['Stemmed_Data'] = twitter_data['Text'].apply(stemming) #Takes a lot of time due to the massive size of the data

In [22]:
#Separating the data and label
# X holds the stemmed text (features); Y holds the Target column (labels).
X = twitter_data["Stemmed_Data"].values
Y = twitter_data["Target"].values

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [23]:
# Splitting Data into Training set and Testing set
# 15% of the rows are held out for testing; stratify=Y asks for the split to
# follow the label proportions of Y, and the fixed random_state makes the split
# repeatable.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.15, stratify = Y, random_state= 5)

In [24]:
# Converting the Textual Data into numerical data
vectorizer = TfidfVectorizer()
# Fit on the training text only; the test text is transformed with the
# already-fitted vectorizer and is never fitted on.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
# NOTE: X_train / X_test are rebound to the vectorized output here, so re-run
# the train/test split cell before re-running this cell.


In [26]:
#Training the Machine Learning Model
# Logistic regression with max_iter raised to 1000, fitted on the vectorized
# training data and its labels.
model = LogisticRegression(max_iter = 1000)
model.fit(X_train,Y_train)

In [27]:
#Accuracy Score on the training data
# Predict on the same data the model was fitted on and compare the predictions
# with the true training labels.
X_train_prediction = model.predict(X_train)
training_accuracy = accuracy_score(Y_train, X_train_prediction)
print("Accuracy score on the training data", training_accuracy)

Accuracy score on the training data 0.810227205882353


In [28]:
#Accuracy on the test data
# Predict on the held-out test split and compare the predictions with the true
# test labels.
X_test_prediction = model.predict(X_test)
testing_accuracy = accuracy_score(Y_test, X_test_prediction)
print("Accuracy score on the testing data", testing_accuracy)

Accuracy score on the testing data 0.7774916666666667


In [30]:
#Saving the Trained Model
import pickle
filename = "trainedmodel.sav"
# Write through a context manager so the file handle is flushed and closed
# even if pickling fails; a bare open(...) left the handle open.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)


In [31]:
#Using the saved model for future predictions
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# this notebook (or another trusted source) produced.
# The context manager closes the file handle once the model has been read.
with open("/content/trainedmodel.sav", 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [38]:
def Sentiment_checker(tweet):
  """Print whether ``tweet`` is classified as a negative or a positive tweet.

  The tweet is first passed through ``stemming`` so that it receives the same
  preprocessing as the text the vectorizer was fitted on; it is then
  vectorized with ``vectorizer`` and classified with ``loaded_model``.

  Args:
    tweet: Raw tweet text (a string).

  Returns:
    None. Prints "Negative tweet" when the predicted label is 0 and
    "Positive tweet" otherwise.
  """
  features = vectorizer.transform([stemming(tweet)])
  # predict() returns one label per input row; exactly one row was passed in.
  label = loaded_model.predict(features)[0]
  if label == 0:
    print("Negative tweet")
  else:
    print("Positive tweet")

In [2]:
#Checking if the model works overall
# Sentiment_checker must already be defined in the running kernel; otherwise
# these calls raise NameError.
Sentiment_checker("I love this. this is the best")
Sentiment_checker("I hate this. this is shit")

NameError: name 'Sentiment_checker' is not defined

In [None]:
# Mount Google Drive at /content/drive (the google.colab package is only
# available inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')