In [1]:
# Create the `.kaggle` directory in the home folder (-p: no error if it already exists).
!mkdir -p ~/.kaggle

# Copy the Kaggle API credentials file (`kaggle.json`) from the current working directory into `~/.kaggle`.
# NOTE(review): the recorded output shows "cp: cannot stat 'kaggle.json'", i.e. the file was not in the
# working directory when this cell ran — place kaggle.json there before running the cell.
!cp kaggle.json ~/.kaggle/

# Set the permissions of `kaggle.json` to 600 (readable and writable only by the owner) for security.
!chmod 600 ~/.kaggle/kaggle.json


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [2]:
# Download the Sentiment140 dataset with the Kaggle command-line tool.
# `-d kazanova/sentiment140` identifies the dataset as <owner>/<dataset-name>; per the recorded
# output the archive is saved as `sentiment140.zip` in /content.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 87% 70.0M/80.9M [00:00<00:00, 158MB/s]
100% 80.9M/80.9M [00:00<00:00, 123MB/s]


In [3]:
from zipfile import ZipFile
# Location of the archive fetched by the Kaggle download step.
dataset = '/content/sentiment140.zip'

# Unpack every member of the archive into the current working directory;
# the context manager closes the archive once the block ends.
with ZipFile(dataset, mode='r') as archive:
    archive.extractall()

    # Reached only if the extraction finished without raising.
    print('The dataset is extracted!')


The dataset is extracted!


In [4]:
# Import essential libraries for data manipulation, text preprocessing, and machine learning

import numpy as np  # Library for numerical operations, used for handling arrays and matrices
import pandas as pd  # Library for data manipulation and analysis, used for working with dataframes
import re  # Regular expressions module, used for pattern matching and text cleaning

# Import libraries for natural language processing (NLP)
from nltk.corpus import stopwords  # To import a list of common words (stop words) to remove during text processing
from nltk.stem.porter import PorterStemmer  # A stemmer used to reduce words to their root form (stemming)

# Import machine learning tools
from sklearn.feature_extraction.text import TfidfVectorizer  # Converts text data into a matrix of TF-IDF features
from sklearn.model_selection import train_test_split  # Splits the dataset into training and testing sets
from sklearn.linear_model import LogisticRegression  # Logistic regression model for classification tasks
from sklearn.metrics import accuracy_score  # Function to calculate the accuracy of the model


In [5]:
import nltk  # Import the Natural Language Toolkit (nltk) library for NLP tasks

# Download the NLTK stopwords corpus, the word lists read by `stopwords.words(...)`.
# The call's return value (True in the recorded run) is echoed as this cell's output.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Print NLTK's English stop-word list; these are the words the `stemming` function drops from each tweet.
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
# First attempt at loading the Sentiment140 CSV (1.6 million tweets, emoticons removed).
# 'ISO-8859-1' encoding is used to correctly read non-ASCII characters in the dataset.
# NOTE: no `names=` / `header=` is passed, so pandas uses the first tweet as the header row — see the
# 1,599,999-row shape and the data-as-column-labels preview in the next cells. The file is re-read
# with explicit column names further down.
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')


In [8]:
# Dimensions as (rows, columns). The row count is one short of 1,600,000 because the first line of
# the CSV was consumed as the header by the load above.
twitter_data.shape


(1599999, 6)

In [9]:
# Preview the first 5 rows. The column labels shown are really the first tweet's values,
# which confirms that the file has no header row of its own.
twitter_data.head(5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [10]:
# The CSV ships without a header row, so the six column labels are supplied here.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Re-read the file. Because `names` is given, the first line is kept as data instead of being
# used as the header. 'ISO-8859-1' is the encoding used to read the raw tweets.
twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=column_names,
)


In [11]:
# Preview the first 5 rows again: the labels are now `target ... text`, and row 0 is the tweet
# that the earlier load had swallowed as the header.
twitter_data.head(5)


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [12]:
# Re-check the dimensions after supplying column names: all 1,600,000 rows and 6 columns are expected.
twitter_data.shape


(1600000, 6)

In [13]:
# Count missing values per column; a column of zeros means no rows have to be dropped or imputed.
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [14]:
# Class balance of the sentiment label. In the raw file the two labels are 0 and 4.
twitter_data['target'].value_counts()


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [15]:
# Recode the positive label from 4 to 1 so the target is binary (0 = negative, 1 = positive).
# The replacement is scoped to the 'target' column and assigned back explicitly instead of
# mutating the whole frame with `inplace=True`; the resulting values are the same.
twitter_data['target'] = twitter_data['target'].replace(4, 1)


In [16]:
# Re-count the labels to confirm the recode: only 0 and 1 should remain, with unchanged counts.
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [17]:
# Single shared PorterStemmer instance; the `stemming` function uses it for every token.
port_stem = PorterStemmer()

In [18]:
# Compiled once; matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Lazily filled so the stop-word list is built once instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(content, stemmer=None, stop_words=None):
    """Clean one text and return its stemmed, stop-word-free tokens joined by single spaces.

    Steps: replace every non-letter with a space, lowercase, split on whitespace,
    drop stop words, stem what is left, and join the stems back together.

    Args:
        content: The raw text (a ``str``), e.g. one tweet.
        stemmer: Object with a ``stem(word) -> str`` method. Defaults to the
            notebook-level ``port_stem`` PorterStemmer.
        stop_words: Container of lowercase words to drop. Defaults to NLTK's
            English stop words, cached as a frozenset.

    Returns:
        The processed text as a single string; empty if no token survives.
    """
    if stemmer is None:
        stemmer = port_stem
    if stop_words is None:
        # Previously `stopwords.words('english')` was called for every token and the
        # resulting list scanned linearly; a cached set makes each lookup O(1).
        stop_words = _english_stopwords()

    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)


In [19]:
# Run `stemming` on every value of the 'text' column and store the results in a new
# 'stemmed_content' column (cleaned, lowercased, stop words removed, stemmed).
# This is one Python-level function call per row, so expect it to take a while on 1.6 million tweets.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [20]:
# Preview the first 5 rows to compare each original tweet with its 'stemmed_content'.
twitter_data.head(5)

Unnamed: 0,target,ids,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [21]:
# Print the 'stemmed_content' column; pandas abbreviates the output to the first and last few rows.
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [22]:
# Print the 'target' column (abbreviated by pandas). The visible head rows are all 0 and the tail
# rows all 1, so the rows appear to be grouped by label.
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [23]:
# Pull the model inputs out of the DataFrame as NumPy arrays.
X = twitter_data['stemmed_content'].values  # Features: one processed tweet string per row
Y = twitter_data['target'].values  # Labels: 0 = negative, 1 = positive

In [24]:
# Print the feature array X (processed tweet strings); NumPy abbreviates the middle of a long array with '...'.
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [25]:
# Print the label array Y (0/1 sentiment labels, aligned row-for-row with X).
print(Y)

[0 0 0 ... 1 1 1]


In [26]:
# Hold out 20% of the tweets for testing.
# `stratify=Y` keeps the class proportions the same in both parts;
# `random_state=2` fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [27]:
# Sanity-check the split: sample counts for the full set, the training part and the test part
# (1,600,000 / 1,280,000 / 320,000 for an 80-20 split).
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [28]:
# TF-IDF vectorizer that turns each processed tweet into a sparse feature vector.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training tweets and vectorize them in one pass.
# fit_transform() is equivalent to fit() followed by transform() but avoids tokenizing the
# 1.28 million training tweets twice.
# NOTE: X_train / X_test are rebound from raw text to sparse matrices here, so this cell cannot be
# re-run on its own — re-run the train/test split cell first.
X_train = vectorizer.fit_transform(X_train)

# Vectorize the test tweets with the vocabulary learned from the training set only (no refit).
X_test = vectorizer.transform(X_test)

In [29]:
# Print the training TF-IDF matrix. It is sparse, so the output lists (row, column) index pairs
# followed by the non-zero weight stored at that position.
print(X_train)

  (0, 443066)	0.4484755317023172
  (0, 436713)	0.27259876264838384
  (0, 354543)	0.3588091611460021
  (0, 235045)	0.41996827700291095
  (0, 185193)	0.5277679060576009
  (0, 109306)	0.3753708587402299
  (1, 160636)	1.0
  (2, 443430)	0.3348599670252845
  (2, 433560)	0.3296595898028565
  (2, 409143)	0.15169282335109835
  (2, 407301)	0.18709338684973031
  (2, 406399)	0.32105459490875526
  (2, 288470)	0.16786949597862733
  (2, 266729)	0.24123230668976975
  (2, 178061)	0.1619010109445149
  (2, 150715)	0.18803850583207948
  (2, 132311)	0.2028971570399794
  (2, 129411)	0.29074192727957143
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (2, 77929)	0.31284080750346344
  (3, 411528)	0.27089772444087873
  (3, 406399)	0.29029991238662284
  (3, 388626)	0.3940776331458846
  (3, 172421)	0.37464146922154384
  :	:
  (1279996, 373144)	0.35212500999832036
  (1279996, 318303)	0.21254698865277744
  (1279996, 291078)	0.17981734369155505
  (1279996, 238078)	0.5606696159563151
  (1279996, 2

In [30]:
# Logistic Regression classifier; `max_iter=1000` caps the number of solver iterations allowed for convergence.
model = LogisticRegression(max_iter=1000)

In [31]:
# Fit the classifier on the TF-IDF training matrix (X_train) and the matching 0/1 labels (Y_train).
model.fit(X_train, Y_train)

In [32]:
# Predict labels for the same tweets the model was trained on.
X_train_prediction = model.predict(X_train)

# Training accuracy. accuracy_score's signature is (y_true, y_pred), so the true labels go first —
# the same order used for the test-set evaluation. The score itself is unaffected by the order,
# but keeping it consistent avoids mistakes if an order-sensitive metric is swapped in later.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [33]:
# Print the training accuracy. It is measured on data the model has already seen, so read it
# together with the test accuracy rather than on its own.
print('Accuracy score on the training data: ', training_data_accuracy)

Accuracy score on the training data:  0.8102125


In [35]:
# Predict labels for the held-out test tweets, which were not used to fit the model.
X_test_prediction = model.predict(X_test)

# Test accuracy: the fraction of test tweets whose predicted label equals the true label
# (true labels first, predictions second).
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [37]:
# Print the test accuracy, i.e. the model's performance on tweets it was not trained on.
print('Test data accuracy: ', test_data_accuracy)

Test data accuracy:  0.778021875


In [38]:
# Import the pickle module, which will be used for saving and loading Python objects, such as trained models.
import pickle

In [39]:
# File the trained model is written to (relative to the current working directory).
filename = 'trained_model.sav'

# Serialize the trained Logistic Regression model with pickle.
# The `with` block closes the file handle — flushing the data to disk — even if dump() raises;
# the previous bare `open(...)` left the handle open.
# NOTE: only the classifier is saved here; the fitted `vectorizer` is also required to turn new
# raw text into features for this model.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [40]:
# Load the previously saved Logistic Regression model, closing the file handle when done.
# `filename` is reused instead of an absolute path so the load path always matches the save path,
# whatever the working directory is.
# NOTE: pickle.load can execute arbitrary code — only unpickle files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [44]:
# Take one row of the TF-IDF test matrix (index 200, i.e. the 201st test sample).
X_new = X_test[200]

# Score it with the model restored from disk.
prediction = loaded_model.predict(X_new)

# Show the raw prediction.
print(prediction)

# Translate the label for display: 0 -> 'Negative', anything else -> 'Positive'.
print('Negative' if prediction[0] == 0 else 'Positive')

[1]
Positive
