In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/final_dataset_basicmlmodel.csv')

# Display the first few rows and check column names
print(data.head())
print(data.info())


   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5242 entries, 0 to 5241
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5242 non-null   int64 
 1   label   5242 non-null   int64 
 2   tweet   5242 non-null   object
dtypes: int64(2), object(1)
memory usage: 123.0+ KB
None


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

# Instantiate a lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Apply text cleaning
data['cleaned_text'] = data['tweet'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features for complexity

# Transform the cleaned text
X = tfidf.fit_transform(data['cleaned_text']).toarray()
y = data['label']  # Replace with the actual name of the target column


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)


In [9]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)


Accuracy: 0.87
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89       606
           1       0.87      0.81      0.84       443

    accuracy                           0.87      1049
   macro avg       0.87      0.86      0.86      1049
weighted avg       0.87      0.87      0.87      1049



In [10]:
# Define a new review to test
new_review = "This product is really amazing, I loved it!"

# Step 1: Clean the review using the same cleaning function
cleaned_review = clean_text(new_review)

# Step 2: Transform the cleaned review using the TF-IDF vectorizer
review_tfidf = tfidf.transform([cleaned_review])  # Note: .transform, not .fit_transform

# Step 3: Predict the label
predicted_label = model.predict(review_tfidf)

# Display the result
print(f"The predicted sentiment for the review is: {predicted_label[0]}")


The predicted sentiment for the review is: 0
