In [19]:
# Load libraries
import re  # used by post_to_words; imported explicitly rather than relying on the wildcard import below
from functools import lru_cache

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickbres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Load both corpora in full (no column selection is applied here).
df = pd.read_csv('Twitter_Data.csv')
df2 = pd.read_csv('Reddit_Data.csv')
# The Reddit file names its text column 'clean_comment'; align it with the
# Twitter column name so the two frames stack into a single 'clean_text' column.
df2 = df2.rename(columns={'clean_comment': 'clean_text'})
# ignore_index=True gives the combined frame a fresh 0..n-1 index in one step
df = pd.concat([df, df2], ignore_index=True)
df.head() # Show the first 5 rows of the combined data (must be the last expression to be displayed)

In [21]:
# Remove, in place, every row that has a missing value in any column.
# Note: the surviving rows keep their original index labels (no re-indexing here).
df.dropna(axis=0, how='any', inplace=True)

In [22]:
# Shift the raw sentiment codes (-1, 0, 1) to the non-negative class ids 0, 1, 2
# used by the rest of the notebook (0 = negative, 1 = neutral, 2 = positive).
CATEGORY_TO_CLASS = {-1: 0, 0: 1, 1: 2}
# Fail loudly on any other value -- e.g. when this cell is run a second time and
# the column already holds 0/1/2 -- instead of silently relabelling it as positive.
unexpected_labels = set(df['category'].unique()) - set(CATEGORY_TO_CLASS)
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'category': {sorted(unexpected_labels)}; "
        f"expected only {sorted(CATEGORY_TO_CLASS)}"
    )
df['category'] = df['category'].map(CATEGORY_TO_CLASS) # Convert the labels to numerical values

In [23]:

@lru_cache(maxsize=1)
def _default_stop_words():
    ''' Return NLTK's English stop words as a frozenset, loaded only once '''
    return frozenset(stopwords.words("english"))


@lru_cache(maxsize=1)
def _default_stemmer():
    ''' Return one shared PorterStemmer instead of building one per word '''
    return PorterStemmer()


def post_to_words(post, stop_words=None, stemmer=None):
    ''' Convert tweet text into a sequence of words

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is split on whitespace, stop
    words are dropped and the remaining words are stemmed.

    Parameters
    ----------
    post : str
        Raw text of one post.
    stop_words : collection of str, optional
        Words to discard (tested with ``in``). Defaults to NLTK's English
        stop-word list.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to a shared ``PorterStemmer``.

    Returns
    -------
    list of str
        The stemmed words, in their original order.
    '''
    # Resolve the defaults once per call; the helpers cache them across calls,
    # so the stop-word corpus is not re-read for every word.
    if stop_words is None:
        stop_words = _default_stop_words()
    if stemmer is None:
        stemmer = _default_stemmer()
    # convert to lower case
    text = post.lower()
    # replace everything except ASCII letters and digits with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # tokenize, remove stopwords, apply stemming
    return [stemmer.stem(w) for w in text.split() if w not in stop_words]

# Take the first remaining row by position: rows removed by dropna() leave gaps in
# the index labels, so a label lookup with [0] is not guaranteed to succeed.
sample_post = df['clean_text'].iloc[0]
print("\nOriginal tweet ->", sample_post)
print("\nProcessed tweet ->", post_to_words(sample_post))


Original tweet -> when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples

Processed tweet -> ['modi', 'promis', 'minimum', 'govern', 'maximum', 'govern', 'expect', 'begin', 'difficult', 'job', 'reform', 'state', 'take', 'year', 'get', 'justic', 'state', 'busi', 'exit', 'psu', 'templ']


In [29]:
# Features: every post reduced to its processed words, re-joined into one
# space-separated string so the vectorizer can consume it.
X = df['clean_text'].map(lambda raw_post: ' '.join(post_to_words(raw_post))).tolist()
# Target: the numeric sentiment class of each post
y = df['category']
# Hold out 10% of the posts for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [30]:
# Fit the TF-IDF vocabulary and weights on the training posts only, then reuse
# the fitted vectorizer unchanged to encode the held-out posts.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [31]:
# Logistic-regression classifier; the iteration cap is raised to 1000 so the
# solver has room to converge on this feature matrix.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_tfidf, y=y_train)

In [32]:
# 3. Prediction and Evaluation
# Predict a class for every held-out post
y_pred = model.predict(X_test_tfidf)

# Headline metric: share of held-out posts whose predicted class matches the label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1 breakdown
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 85.21%
              precision    recall  f1-score   support

           0       0.82      0.73      0.77      4288
           1       0.83      0.92      0.87      6847
           2       0.89      0.86      0.87      8877

    accuracy                           0.85     20012
   macro avg       0.85      0.84      0.84     20012
weighted avg       0.85      0.85      0.85     20012


In [37]:
def predict_sentiment(model, text):
    """Classify one piece of text and print the result.

    The text is lower-cased, run through ``post_to_words``, re-joined into a
    space-separated string, encoded with the notebook-level ``vectorizer`` and
    passed to ``model.predict``. One line of the form
    ``<lower-cased text> -> <processed text> -> <sentiment>`` is printed;
    nothing is returned.

    Args:
        model: Fitted classifier whose ``predict`` yields the class ids 0, 1
            or 2 (printed as Negative / Neutral / Positive).
        text: Raw input string.
    """
    lowered = text.lower()

    # Same preprocessing as the training posts: processed words joined into one string
    processed = ' '.join(post_to_words(lowered))

    # transform() takes an iterable of documents, hence the one-element list
    features = vectorizer.transform([processed])

    # predict() returns one class id per document; there is exactly one here
    label_id = model.predict(features)[0]

    # Translate the numeric class id back into a readable name
    label_names = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    print(f"{lowered} -> {processed} -> {label_names[label_id]}")

In [41]:

# Try the classifier on a handful of hand-written sentences
sample_sentences = [
    "I am Nikita Breslavsky",
    "I hate Nikita Breslavsky",
    "I love Nikita Breslavsky",
    "I love to kill people",
    "I love to hate myself",
    "I hate to love myself",
]
for sentence in sample_sentences:
    predict_sentiment(model, sentence)

i am nikita breslavsky -> nikita breslavski -> Neutral
i hate nikita breslavsky -> hate nikita breslavski -> Negative
i love nikita breslavsky -> love nikita breslavski -> Positive
i love to kill people -> love kill peopl -> Positive
i love to hate myself -> love hate -> Negative
i hate to love myself -> hate love -> Negative
