In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import kagglehub

# Download latest version
# `path` is the value returned by kagglehub for this dataset handle; the loading cell
# below appends "twitter_training.csv" to it, so it is treated as a local directory.
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

# Echo the location so the reader can see where the files were placed.
print("Path to dataset files:", path)

Path to dataset files: C:\Users\agnik\.cache\kagglehub\datasets\jp797498e\twitter-entity-sentiment-analysis\versions\2


In [4]:
import os

import pandas as pd

# The CSV ships without a header row, so the column labels are supplied at read time
# (header=None + names=...) rather than patched onto the frame afterwards.
# os.path.join builds the file path with the platform's separator instead of a
# hard-coded "/" glued onto the download directory.
df = pd.read_csv(
    os.path.join(path, "twitter_training.csv"),
    header=None,
    names=["Tweet_ID", "Entity", "Sentiment", "Tweet_Content"],
)

In [5]:
# Preview the first rows to confirm the four column labels line up with the data.
df.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
# Count missing values per column to see which fields need handling before vectorizing.
df.isnull().sum()

Tweet_ID           0
Entity             0
Sentiment          0
Tweet_Content    686
dtype: int64

In [7]:
import re

def clean_text(text):
    """Normalize one tweet for bag-of-words vectorization.

    Strings are lower-cased, stripped of every character that is not an ASCII
    letter, a digit, or whitespace, and each run of whitespace is collapsed to
    a single space (leading/trailing spaces are not trimmed).

    Any non-string value (for example the NaN of a missing tweet) is returned
    unchanged so the caller can decide how to handle it.
    """
    # Guard clause: leave missing / non-text values untouched.
    if not isinstance(text, str):
        return text

    lowered = text.lower()
    # Drop punctuation and other special characters.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    # Squeeze tabs, newlines and repeated spaces into one space.
    return re.sub(r'\s+', ' ', alnum_only)

# Run every tweet through clean_text; missing values pass through unchanged.
df['Tweet_Content'] = df['Tweet_Content'].map(clean_text)


In [8]:
# Keep only the first row for each distinct cleaned tweet text.
df = df.drop_duplicates(subset='Tweet_Content', keep='first')

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Features are the cleaned tweet text; the target is the sentiment label.
X, y = df['Tweet_Content'], df['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the row preview above shows several near-identical tweets sharing one
# Tweet_ID. A purely random split can place variants of the same tweet in both sets,
# which may inflate the scores printed below -- consider a group-aware split on
# Tweet_ID and confirm before relying on these numbers.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Any tweet that is still NaN is vectorized as an empty document.
X_train, X_test = X_train.fillna(''), X_test.fillna('')

# Bag-of-words term counts. Despite the `_tfidf` suffix on the matrices below,
# CountVectorizer produces raw counts, not TF-IDF weights; the names are kept
# because later cells refer to them.
vectorizer = CountVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)  # vocabulary is learned from the training split only
X_test_tfidf = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on the held-out rows and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.8419047619047619
              precision    recall  f1-score   support

  Irrelevant       0.86      0.78      0.82      2460
    Negative       0.85      0.89      0.87      4108
     Neutral       0.86      0.80      0.83      3317
    Positive       0.81      0.87      0.84      3765

    accuracy                           0.84     13650
   macro avg       0.84      0.83      0.84     13650
weighted avg       0.84      0.84      0.84     13650



In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Create and train the K-Nearest Neighbors model (k = 5) on X_train_tfidf.
# NOTE(review): despite its name, X_train_tfidf is built with CountVectorizer in the
# logistic-regression cell, so these are raw term counts rather than TF-IDF weights.
model1 = KNeighborsClassifier(n_neighbors=5)  
model1.fit(X_train_tfidf, y_train)

In [12]:
# Re-run the split with the same arguments and seed as the earlier modeling cell.
# Assuming X and y have not changed in between, this reproduces the same partition,
# but it rebinds X_train / X_test to the un-filled Series, so any NaN tweets are back
# until fillna('') is applied again. X_train_tfidf / X_test_tfidf are not rebuilt here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [13]:
from sklearn.metrics import accuracy_score

# Score the KNN model on the held-out feature matrix built in the vectorizing cell.
# NOTE(review): X_test_tfidf comes from the first split while y_test was rebound by the
# re-split cell just above; the two line up only because both splits use the same
# inputs and random_state. Keep them in sync if either call is changed.
# y_pred and accuracy are reused here, so the logistic-regression values are overwritten.
y_pred = model1.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# The re-split rebinds X_train / X_test to Series that may contain NaN again,
# so blank those entries out once more.
X_train = X_train.fillna('')
X_test = X_test.fillna('')

Accuracy: 0.8638827838827838


In [14]:
# Replace missing tweets with empty strings in both splits.
X_train, X_test = X_train.fillna(''), X_test.fillna('')

In [15]:
# Sanity check: after fillna('') the test split should report zero missing tweets.
X_test.isnull().sum()

0

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier  # KNN (not LogisticRegression) is the classifier used in this cell
import string

# Define the clean_text function to preprocess the text
def clean_text(text):
    """Normalize one tweet with the same rules as the training-time cleaner.

    This definition shadows the earlier ``clean_text`` in the notebook, so it
    applies identical steps to keep inference preprocessing in line with how
    the training text was prepared:

    * lower-case the text,
    * drop every character that is not an ASCII letter, a digit, or whitespace,
    * collapse each run of whitespace to a single space (no trimming).

    Parameters
    ----------
    text : str or any
        Raw tweet text. Non-string values (e.g. None / NaN) are returned
        unchanged instead of raising.

    Returns
    -------
    str or any
        The cleaned string, or the original object when it is not a string.
    """
    # Imported locally because this cell's own import block does not bring in `re`.
    import re

    if not isinstance(text, str):
        return text

    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text)

# Demo only: fit a throwaway TF-IDF + KNN model on four hand-written sentences.
# NOTE(review): this cell rebinds `vectorizer`, `model1` and `X`. Anything that uses
# those names afterwards -- including the pickling cell below -- gets this toy model,
# not a classifier trained on the Twitter data. Replace the corpus/labels with the
# real fitted objects before using the saved file for anything beyond a demo.
corpus = ["I love this product", "I hate this", "This is amazing", "Not good"]
labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

vectorizer = TfidfVectorizer()
# Clean the training sentences with the same function that is applied to the user's
# tweet below, so the vocabulary is learned from text prepared the same way as the
# text passed to predict().
X = vectorizer.fit_transform([clean_text(doc) for doc in corpus])
model1 = KNeighborsClassifier(n_neighbors=3)
model1.fit(X, labels)  # Train the model

# Get user input for a new tweet (blocks until something is typed)
user_tweet = input("Whats on your Mind? ")

# Preprocess the user's tweet
user_tweet_cleaned = clean_text(user_tweet)

# transform() expects an iterable of documents, hence the one-element list
user_tweet_tfidf = vectorizer.transform([user_tweet_cleaned])

# Make prediction
user_tweet_prediction = model1.predict(user_tweet_tfidf)

# predict() returns one label per input row; show the only one (1 = positive, 0 = negative)
print(f"Predicted Sentiment for your tweet: {user_tweet_prediction[0]}")


Predicted Sentiment for your tweet: 1


In [10]:
import pickle



# Persist the fitted vectorizer and classifier as one tuple so they are always
# restored as a matched pair.
with open('model.pkl', 'wb') as f:
    pickle.dump((vectorizer, model1), f)

print("Model and vectorizer saved successfully!")

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only load a
# model.pkl written by this notebook or another trusted source.
with open('model.pkl', 'rb') as f:
    vectorizer, model1 = pickle.load(f)
print("Model and vectorizer loaded successfully!")

# Inference runs after the file has been closed: the handle is not needed while
# waiting on input(), so it is not held open across the prompt.
user_tweet = input("Whats on your Mind? ")
user_tweet_cleaned = clean_text(user_tweet)
user_tweet_tfidf = vectorizer.transform([user_tweet_cleaned])
user_tweet_prediction = model1.predict(user_tweet_tfidf)
print(f"Predicted Sentiment for your tweet: {user_tweet_prediction[0]}")

Model and vectorizer saved successfully!
Model and vectorizer loaded successfully!
Predicted Sentiment for your tweet: 1
