In [2]:
# Load libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the two labelled corpora and stack them into a single frame.
# Later cells read the `clean_text` and `category` columns from the result, so
# Twitter_Data.csv is presumably already using those names (TODO confirm);
# Reddit_Data.csv calls its text column `clean_comment`, hence the rename.
df = pd.read_csv('Twitter_Data.csv')
df2 = pd.read_csv('Reddit_Data.csv')
df2 = df2.rename(columns={'clean_comment': 'clean_text'})

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source keeps its own row numbers and the index labels are duplicated.
df = pd.concat([df, df2], ignore_index=True)

# Must be the last expression of the cell, otherwise nothing is displayed
df.head() # Show the first 5 rows of the combined data

In [4]:
# Drop every row that has a missing value in any column.
# inplace=True mutates `df` itself, so the rows are gone for all later cells.
df.dropna(inplace=True)

In [5]:
# Re-encode the raw sentiment labels as consecutive class ids:
#   -1 -> 0 (Negative), 0 -> 1 (Neutral), 1 -> 2 (Positive)
label_map = {-1: 0, 0: 1, 1: 2}

# Validate before overwriting the column. Any value outside the raw label set
# would otherwise be silently turned into a class id, and re-running this cell
# on already-encoded labels would silently shift them a second time.
raw_labels = df['category']
is_known = raw_labels.isin(list(label_map))
if not is_known.all():
    unexpected = raw_labels[~is_known].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'category': {unexpected}; expected the raw "
        "labels -1, 0, 1 (has this cell already been run?)"
    )

# int64 keeps the same integer dtype the labels had before this change
df['category'] = raw_labels.map(label_map).astype('int64')

In [6]:
# Model input is the cleaned text; the value to predict is the sentiment class id
X, y = df['clean_text'], df['category']

# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [7]:
# Turn the raw text into TF-IDF feature matrices.
vectorizer = TfidfVectorizer()
# Fit on the training texts only, so the held-out texts do not influence
# what the vectorizer learns.
X_train_tfidf = vectorizer.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test texts (transform, not fit).
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
# Train a logistic-regression classifier on the TF-IDF features of the training split.
model = LogisticRegression(max_iter=1000)  # Iteration cap raised (scikit-learn's documented default is 100) so the solver can converge
model.fit(X_train_tfidf, y_train)

In [9]:
# Prediction and evaluation on the held-out test split
y_pred = model.predict(X_test_tfidf)
# accuracy_score is a fraction in [0, 1]; it is scaled to a percentage for display
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Detailed performance report: precision / recall / F1 / support per class id
print(classification_report(y_test, y_pred))

Accuracy: 93.13%
              precision    recall  f1-score   support

           0       0.91      0.84      0.87      4288
           1       0.93      0.98      0.95      6847
           2       0.94      0.94      0.94      8877

    accuracy                           0.93     20012
   macro avg       0.93      0.92      0.92     20012
weighted avg       0.93      0.93      0.93     20012


In [10]:
def predict_sentiment(model, text, text_vectorizer=None):
    """Classify a single piece of text and print its predicted sentiment.

    Parameters
    ----------
    model : fitted classifier
        Object with a ``predict`` method that returns one of the class ids
        0, 1 or 2 for each input row.
    text : str
        Text to classify. It is lower-cased before being vectorized and printed.
    text_vectorizer : fitted vectorizer, optional
        Object whose ``transform`` turns an iterable of strings into the
        features `model` was trained on. When omitted, the notebook-level
        ``vectorizer`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    None
        The result is printed as ``"<lower-cased text> is <sentiment>"``.

    Raises
    ------
    KeyError
        If the model predicts a class id other than 0, 1 or 2.
    """
    if text_vectorizer is None:
        # Fall back to the vectorizer fitted earlier in the notebook
        text_vectorizer = vectorizer

    text = text.lower() # Convert the text to lowercase

    # transform() accepts any iterable of strings, so a one-element list is
    # enough; no DataFrame wrapper is needed.
    features = text_vectorizer.transform([text])

    # predict() returns one class id per input row; a single row was passed in
    predicted_label = model.predict(features)[0]

    # Map the numerical label back to the original label
    sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    predicted_sentiment = sentiment_map[predicted_label]

    # Print the predicted sentiment
    print(f"{text} is {predicted_sentiment}")

In [11]:
# Spot-check the classifier on a handful of hand-written sentences
sample_texts = [
    "I am Gilad Fisher",
    "I am Barak Finkel",
    "I am Nikita Breslavsky",
    "I hate Nikita Breslavsky",
    "I love Nikita Breslavsky",
    "I love to kill people",
    "I dont love myself",
    "I love to hate myself",
]
for sample in sample_texts:
    predict_sentiment(model, sample)

i am gilad fisher is Neutral
i am barak finkel is Neutral
i am nikita breslavsky is Neutral
i hate nikita breslavsky is Negative
i love nikita breslavsky is Positive
i love to kill people is Positive
i dont love myself is Positive
i love to hate myself is Negative
