In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv("sentiment140.csv", encoding='ISO-8859-1', header=None)
# Check the data
print(df.head())


   0           1                             2         3                4  \
0  0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                   5  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  object
 1   text    1600000 non-null  object
dtypes: object(2)
memory usage: 24.4+ MB


In [None]:
import re

def clean_tweet(text):
    text = re.sub(r"@\w+", "", text)               # Remove usernames
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"#(\w+)", "", text)             # Remove hashtags
    text = re.sub(r"[^\w\s]", "", text)            # Remove special characters
    text = re.sub(r"\d+", "", text)                # Remove numbers (optional)
    text = text.lower()                            # Convert text to lowercase
    return text

# Apply the cleaning function
df['cleaned_text'] = df['text'].apply(clean_tweet)

# Check the cleaned data
print(df[['cleaned_text', 'target']].head())


                                        cleaned_text    target
0     a thats a bummer  you shoulda got david car...  negative
1  is upset that he cant update his facebook by t...  negative
2   i dived many times for the ball managed to sa...  negative
3    my whole body feels itchy and like its on fire   negative
4   no its not behaving at all im mad why am i he...  negative


In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download the tokenizer data

# Tokenize each tweet
df['tokens'] = df['cleaned_text'].apply(word_tokenize)

# Check tokenized data
print(df[['tokens', 'target']].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                                              tokens    target
0  [a, thats, a, bummer, you, shoulda, got, david...  negative
1  [is, upset, that, he, cant, update, his, faceb...  negative
2  [i, dived, many, times, for, the, ball, manage...  negative
3  [my, whole, body, feels, itchy, and, like, its...  negative
4  [no, its, not, behaving, at, all, im, mad, why...  negative


In [None]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # Download WordNet lemmatizer data
lemmatizer = WordNetLemmatizer()

# Lemmatize each word in each tweet
df['lemmatized_tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Check lemmatized tokens
print(df[['lemmatized_tokens', 'target']].head())


[nltk_data] Downloading package wordnet to /root/nltk_data...


                                   lemmatized_tokens    target
0  [a, thats, a, bummer, you, shoulda, got, david...  negative
1  [is, upset, that, he, cant, update, his, faceb...  negative
2  [i, dived, many, time, for, the, ball, managed...  negative
3  [my, whole, body, feel, itchy, and, like, it, ...  negative
4  [no, it, not, behaving, at, all, im, mad, why,...  negative


In [None]:
df['processed_text'] = df['lemmatized_tokens'].apply(lambda x: ' '.join(x))
print(df[['processed_text', 'target']].head())


                                      processed_text    target
0  a thats a bummer you shoulda got david carr of...  negative
1  is upset that he cant update his facebook by t...  negative
2  i dived many time for the ball managed to save...  negative
3       my whole body feel itchy and like it on fire  negative
4  no it not behaving at all im mad why am i here...  negative


In [None]:
# Add New Features
df['text_length'] = df['text'].apply(len)
df['exclamation_count'] = df['text'].apply(lambda x: x.count('!'))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
# Initialize the vectorizer with a maximum feature limit for simplicity
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)  # Limiting to 5000 words for efficiency

# Transform the cleaned text column into TF-IDF features
X = tfidf_vectorizer.fit_transform(df['processed_text'])

X_extra = hstack((X, df[['text_length', 'exclamation_count']].values))

# Check the shape of the feature matrix
print("TF-IDF feature matrix shape:", X.shape)


TF-IDF feature matrix shape: (1600000, 5000)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Encode target labels to numerical values
y = label_encoder.fit_transform(df['target'])

# Check the encoded labels
print("Encoded labels:", y[:5])


Encoded labels: [0 0 0 0 0]


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (1280000, 5000)
Testing set size: (320000, 5000)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Logistic Regression Accuracy: 0.79599375
Classification Report:
               precision    recall  f1-score   support

    negative       0.80      0.78      0.79    159494
    positive       0.79      0.81      0.80    160506

    accuracy                           0.80    320000
   macro avg       0.80      0.80      0.80    320000
weighted avg       0.80      0.80      0.80    320000



In [None]:
import numpy as np

# Find misclassified examples for logistic regression
misclassified_idx = np.where(y_test != y_pred)[0]
misclassified_samples = [(df['text'].iloc[i], y_test[i], y_pred[i]) for i in misclassified_idx[:5]]
for tweet, actual, predicted in misclassified_samples:
    print(f"Tweet: {tweet}\nActual: {label_encoder.classes_[actual]}\nPredicted: {label_encoder.classes_[predicted]}\n")


Tweet: @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
Actual: negative
Predicted: positive

Tweet: is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
Actual: negative
Predicted: positive

Tweet: @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
Actual: negative
Predicted: positive

Tweet: spring break in plain city... it's snowing 
Actual: positive
Predicted: negative

Tweet: @octolinz16 It it counts, idk why I did either. you never talk to me anymore 
Actual: negative
Predicted: positive



In [None]:
import joblib

# Save the logistic regression model
joblib.dump(log_reg, 'logistic_regression_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [None]:
# Load the model and vectorizer
loaded_model = joblib.load('logistic_regression_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example of a new tweet
new_tweet = ["I'm so happy with this product!"]

# Preprocess and vectorize the new tweet
new_tweet_cleaned = [clean_tweet(new_tweet[0])]  # Simplify by using the existing cleaning function
new_tweet_vectorized = loaded_vectorizer.transform(new_tweet_cleaned)

# Predict sentiment
predicted_sentiment = loaded_model.predict(new_tweet_vectorized)
print("Predicted Sentiment:", label_encoder.inverse_transform(predicted_sentiment))


Predicted Sentiment: ['positive']


In [None]:
# Test with a few sample tweets
sample_tweets = [
    "I'm so happy with this product!",
    "This is the worst experience I've ever had.",
    "It's okay, nothing special."
]

for tweet in sample_tweets:
    tweet_cleaned = [clean_tweet(tweet)]
    tweet_vectorized = loaded_vectorizer.transform(tweet_cleaned)
    sentiment = loaded_model.predict(tweet_vectorized)
    print(f"Tweet: {tweet}\nPredicted Sentiment: {label_encoder.inverse_transform(sentiment)[0]}\n")


Tweet: I'm so happy with this product!
Predicted Sentiment: positive

Tweet: This is the worst experience I've ever had.
Predicted Sentiment: negative

Tweet: It's okay, nothing special.
Predicted Sentiment: positive

