## Sentiment Analysis using ML


Reference Links:

https://neptune.ai/blog/exploratory-data-analysis-natural-language-processing-tools

In [None]:
import pandas as pd

# Load the labelled reviews from the CSV that sits next to this notebook.
# Later cells read the 'Review' (text) and 'Sentiment' (label) columns.
sentiment_df = pd.read_csv("sentiment_dataset.csv")

# Only the last expression of a cell is rendered automatically, so a bare
# `sentiment_df.head()` here would be evaluated and silently discarded.
print(sentiment_df.head())

# Number of rows per sentiment label (shows the class balance).
print(sentiment_df.Sentiment.value_counts())

## EDA - Sentiment Dataset

In [None]:

# Preview the first rows. A bare `sentiment_df.head()` that is not the last
# expression of the cell is evaluated and discarded, so print it explicitly.
print(sentiment_df.head())
# print(sentiment_df.tail())
print('\n=============================================================\n')

# print(sentiment_df.info())

print('### Numerical features ###','\n')
# exclude=['O'] leaves object-dtype columns out of the summary statistics.
print(sentiment_df.describe(exclude=['O']))

print('\n=============================================================\n')
# print(sentiment_df['Sentiment'].value_counts())

# Count rows that are exact duplicates of an earlier row; dropping them is
# left as an optional step.
print("Number of duplicates: " + str(sentiment_df.duplicated().sum()))
# sentiment_df.drop_duplicates(inplace=True)

print('\n=============================================================\n')

# Optional: missing-value check and the two usual ways to handle them.
# print(sentiment_df.isnull().sum())

# # sentiment_df.dropna(inplace=True)  # Drop rows with missing values
# # Or
# # sentiment_df.fillna(<< write the value >>, inplace=True)  # Fill missing values with a specific value

print('\n=============================================================\n')

# sentiment_df.dtypes

print('\n=============================================================\n')

# # Optional: create a new column with human-readable labels.
# # NOTE(review): assumes the value 1 in 'Sentiment' encodes the positive class - confirm.
# sentiment_df['Labels'] = 'Negative'
# sentiment_df.loc[sentiment_df['Sentiment'] == 1, 'Labels'] = 'Positive'

print('\n=============================================================\n')

# Add a 'Word_Count' column: split every review on whitespace, then take the
# length of the resulting token list.
review_tokens = sentiment_df['Review'].str.split()
sentiment_df['Word_Count'] = review_tokens.str.len()

# Alternative kept for reference - a row-by-row version of the same count:
#
# def count_words(review):
#     return len(review.split())
#
# sentiment_df['Word_Count'] = sentiment_df['Review'].apply(count_words)



## Data Processing

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Raw inputs: review text is the feature source, 'Sentiment' is the target.
reviews = sentiment_df['Review']
sentiments = sentiment_df['Sentiment']

# Split the RAW text before any feature extraction. Fitting TF-IDF on the full
# corpus would let the held-out reviews influence the vocabulary and the IDF
# weights (data leakage), which inflates the reported test accuracy.
# The same test_size/random_state are kept, so the rows that land in each
# split are unchanged.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    reviews, sentiments,
    test_size=0.2,
    random_state=17)

# You can adjust max_features as needed
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary / IDF weights from the training reviews only, then
# apply that fitted transformation to the held-out reviews.
X_train = tfidf_vectorizer.fit_transform(reviews_train)
X_test = tfidf_vectorizer.transform(reviews_test)

# Full-corpus feature matrix, kept so any later cell that reads `text_data`
# keeps working; it is produced by the train-fitted vectorizer.
text_data = tfidf_vectorizer.transform(reviews)

## Applying Logistic Regression Classifier

In [None]:
# Step 5: Model Training
# Build a logistic regression classifier with library defaults and fit it on
# the training split (fit() returns the fitted estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Step 6: Model Evaluation
# Predict labels for the held-out split and compare them with the true labels.
y_pred = model.predict(X_test)

# accuracy_score returns a fraction in [0, 1]; it is reported as a percentage.
lr_accuracy_pct = accuracy_score(y_test, y_pred)*100
print("Logistic Regression model accuracy(in %):", lr_accuracy_pct)

# Classify one unseen review end to end.
new_review = "This is a new review that needs to be classified good movie."

# The review must go through the already-fitted TF-IDF vectorizer so that its
# features line up with the ones the model was trained on.
new_review_features = tfidf_vectorizer.transform([new_review])
predicted_label = model.predict(new_review_features)

# NOTE(review): the text below is chosen by the truthiness of the predicted
# label, which presumes the negative class is encoded as 0/False - confirm
# against the dataset's 'Sentiment' values.
if predicted_label[0]:
    sentiment_text = "Positive Sentiment"
else:
    sentiment_text = "Negative Sentiment"
print("Predicted Label:", sentiment_text)

## Applying Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Step 5: Model Training
# Random forest of 100 trees; random_state is fixed so runs are repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators as needed
rf_model.fit(X_train, y_train)

# Step 6: Model Evaluation
y_pred_rf = rf_model.predict(X_test)

# accuracy_rf stays a fraction in [0, 1]; it is printed as a percentage so the
# figure is directly comparable with the logistic regression cell's output.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest model accuracy(in %):", accuracy_rf*100)

# Testing with a new string
new_review = "This is a new review that needs to be classified good movie."
# Preprocess the new string using the same TF-IDF vectorizer
new_review_features = tfidf_vectorizer.transform([new_review])

predicted_label_rf = rf_model.predict(new_review_features)

# Report the prediction in the same readable form as the logistic regression cell.
# NOTE(review): relies on label truthiness, which presumes the negative class
# is encoded as 0/False - confirm against the dataset's 'Sentiment' values.
print("Predicted Label (Random Forest):",
      "Positive Sentiment" if predicted_label_rf[0] else "Negative Sentiment")
