In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import hstack, csr_matrix
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:
# Load the cleaned dataset
# The path is relative, so the CSV must sit in the notebook's working directory.
# Later cells read the 'Reviews', 'Rating' and 'Rev_len' columns from this frame.
df = pd.read_csv('Cleaned_data.csv')

In [None]:
# Analyzer instance used by get_compound_sentiment_score; created once here so
# the cell no longer depends on a variable left over from an earlier session.
analyzer = SentimentIntensityAnalyzer()


# Function to get compound sentiment score
def get_compound_sentiment_score(text):
    """Return the VADER compound polarity score for a single review.

    Args:
        text: Review text. Non-string values are converted with ``str``.

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores(...)``, or ``0``
        when ``text`` is missing (NaN / None).
    """
    if pd.isna(text):
        return 0  # Return a neutral score for missing reviews
    return analyzer.polarity_scores(str(text))['compound']


# Score the column before casting it to str: astype(str) turns NaN into the
# literal text 'nan', which would bypass the missing-value guard above.
df['compound_sentiment_score'] = df['Reviews'].apply(get_compound_sentiment_score)
df['Reviews'] = df['Reviews'].astype(str)

In [None]:
# Preview the first rows to confirm the compound_sentiment_score column is present.
df.head()

Unnamed: 0,Reviews,Rating,Rev_len,compound_sentiment_score
0,decide eat aware going take hours beginning en...,3,513,0.8968
1,second time tried turning point location first...,2,477,-0.3535
2,place cute staff friendly nice menu good brunc...,4,216,0.9538
3,came saturday morning waiting months opening h...,3,736,0.8965
4,mediocre best decor nice like restaurant tryin...,2,953,0.905


In [None]:
# TF-IDF text features, capped at a 1000-term vocabulary.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed in a later cell, so held-out reviews contribute to
# the vocabulary and IDF weights -- confirm this is acceptable.
tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['Reviews'])

# Numeric columns, cast to float and wrapped as a sparse matrix so they can be
# stacked next to the TF-IDF block.
numeric_columns = ['Rating', 'Rev_len']
other_features = csr_matrix(df[numeric_columns].astype(float))

# Final feature matrix: TF-IDF columns first, then Rating and Rev_len.
combined_features = hstack((tfidf_matrix, other_features))

In [None]:
# Show the first five rows of the feature matrix in dense form.
# Only the displayed rows are densified: calling .toarray() on the whole matrix
# would allocate a full rows x columns dense array just to print five rows.
# hstack returns a COO matrix, which cannot be sliced, hence the tocsr() first.
combined_features_preview = combined_features.tocsr()[:5].toarray()

print(combined_features_preview)

[[  0.   0.   0. ...   0.   3. 513.]
 [  0.   0.   0. ...   0.   2. 477.]
 [  0.   0.   0. ...   0.   4. 216.]
 [  0.   0.   0. ...   0.   3. 736.]
 [  0.   0.   0. ...   0.   2. 953.]]


In [None]:
# Sanity check: (rows, columns) of the combined sparse feature matrix.
combined_features.shape

(64282, 1002)

In [None]:
# Sanity check: the target column should have one entry per feature-matrix row.
df['compound_sentiment_score'].shape

(64282,)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# (train_test_split is imported in the imports cell at the top of the notebook.)
train_df_x, test_df_x, train_df_y, test_df_y = train_test_split(
    combined_features,
    df['compound_sentiment_score'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Disabled alternative pipeline, kept as a bare string literal so none of it runs.
# It splits df first and fits the TF-IDF vectorizer on the training rows only,
# then reuses that fitted vectorizer to transform the test rows.
'''# Splitting the dataset into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # You can adjust the number of max_features as needed

# Fit and transform the TF-IDF vectorizer on the training data
tfidf_matrix_train = tfidf.fit_transform(train_df['Reviews'])

# Transform the test data using the same vectorizer
tfidf_matrix_test = tfidf.transform(test_df['Reviews'])

# Convert 'Rating' and 'Rev_len' to a sparse matrix for both train and test data
other_features_train = csr_matrix(train_df[['Rating', 'Rev_len']].astype(float))
other_features_test = csr_matrix(test_df[['Rating', 'Rev_len']].astype(float))

# Combine TF-IDF features with 'Rating' and 'Rev_len' for both train and test data
combined_features_train = hstack([tfidf_matrix_train, other_features_train])
combined_features_test = hstack([tfidf_matrix_test, other_features_test])
'''

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator,
# so construction and fitting are chained).
model = LinearRegression().fit(train_df_x, train_df_y)

# Predicted compound scores for the held-out rows
y_pred = model.predict(test_df_x)

In [None]:
# Regression metrics on the held-out split
mse = mean_squared_error(test_df_y, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
r2 = r2_score(test_df_y, y_pred)
mae = mean_absolute_error(test_df_y, y_pred)
explained_variance = explained_variance_score(test_df_y, y_pred)

# Print every metric as "<label> <value>", one per line
for label, value in (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared Score:", r2),
    ("Mean Absolute Error:", mae),
    ("Explained Variance Score:", explained_variance),
):
    print(label, value)

Mean Squared Error: 0.07019703643622935
Root Mean Squared Error: 0.2649472333054817
R-squared Score: 0.6000371038044646
Mean Absolute Error: 0.17807340508346922
Explained Variance Score: 0.6000408313338514


In [None]:
# Bucket a compound sentiment score into a three-way text label
def categorize_sentiment(score):
    """Return 'negative', 'positive' or 'neutral' for a compound score.

    Scores at or below -0.05 are 'negative', scores at or above 0.05 are
    'positive', and anything strictly in between is 'neutral'.
    """
    if score >= 0.05:
        return 'positive'
    if score <= -0.05:
        return 'negative'
    return 'neutral'

# Fixed label order shared by the confusion matrix and the report
sentiment_labels = ['negative', 'neutral', 'positive']

# Bucket both the true scores and the regression predictions into labels
true_categories = test_df_y.apply(categorize_sentiment)
predicted_categories = pd.Series(y_pred).apply(categorize_sentiment)

# Confusion matrix over the three labels
conf_matrix = confusion_matrix(true_categories, predicted_categories, labels=sentiment_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(true_categories, predicted_categories, labels=sentiment_labels))


Confusion Matrix:
[[  295   106   546]
 [    6     7   126]
 [   46    38 11687]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.31      0.46       947
     neutral       0.05      0.05      0.05       139
    positive       0.95      0.99      0.97     11771

    accuracy                           0.93     12857
   macro avg       0.61      0.45      0.49     12857
weighted avg       0.93      0.93      0.92     12857



In [None]:
# Integer-coded sentiment buckets, used as the classifier targets.
# NOTE(review): this shadows the string-returning categorize_sentiment defined
# in an earlier cell; once this cell has run, the name refers to this version.
def categorize_sentiment(score):
    """Return 0, 2 or 1 for a compound score.

    Scores at or below -0.05 map to 0, scores at or above 0.05 map to 2, and
    anything strictly in between maps to 1.
    """
    if score >= 0.05:
        return 2
    if score <= -0.05:
        return 0
    return 1

# Integer class targets for the held-out and training rows
test_df_y_c = test_df_y.apply(categorize_sentiment)
train_df_y_c = train_df_y.apply(categorize_sentiment)

# Every estimator below is seeded (same value as the train/test split) so that
# a Restart & Run All reproduces the reported metrics; an unseeded random
# forest in particular gives different predictions from run to run.

# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(train_df_x, train_df_y_c)
gbc_pred = gbc.predict(test_df_x)

# XGBoost Classifier
xgbc = xgb.XGBClassifier(random_state=42)
xgbc.fit(train_df_x, train_df_y_c)
xgbc_pred = xgbc.predict(test_df_x)

# Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(train_df_x, train_df_y_c)
rfc_pred = rfc.predict(test_df_x)


In [None]:
# Evaluation
# The loop variable is `model_name`, not `model`, so the fitted LinearRegression
# bound to `model` earlier in the notebook is not overwritten with a string.
for model_name, prediction in zip(['Gradient Boosting', 'XGBoost', 'Random Forest'],
                                  [gbc_pred, xgbc_pred, rfc_pred]):
    print(f"Model: {model_name}")
    print(classification_report(test_df_y_c, prediction))
    print(confusion_matrix(test_df_y_c, prediction))
    print("\n")

Model: Gradient Boosting
              precision    recall  f1-score   support

           0       0.71      0.53      0.61       947
           1       0.10      0.01      0.03       139
           2       0.96      0.99      0.97     11771

    accuracy                           0.94     12857
   macro avg       0.59      0.51      0.53     12857
weighted avg       0.93      0.94      0.93     12857

[[  503     6   438]
 [   38     2    99]
 [  163    13 11595]]


Model: XGBoost
              precision    recall  f1-score   support

           0       0.74      0.59      0.65       947
           1       0.39      0.05      0.09       139
           2       0.96      0.99      0.97     11771

    accuracy                           0.95     12857
   macro avg       0.70      0.54      0.57     12857
weighted avg       0.94      0.95      0.94     12857

[[  555     8   384]
 [   41     7    91]
 [  155     3 11613]]


Model: Random Forest
              precision    recall  f1-score  

In [None]:
# Collect one summary record per classifier; the DataFrame is built once at the end
results = []

# Evaluation
# `model_name` (rather than `model`) keeps the fitted LinearRegression bound to
# `model` intact, and the confusion matrix is stored straight into the record so
# the earlier `conf_matrix` result is not overwritten.
for model_name, prediction in zip(['Gradient Boosting', 'XGBoost', 'Random Forest'],
                                  [gbc_pred, xgbc_pred, rfc_pred]):
    report = classification_report(test_df_y_c, prediction, output_dict=True)
    macro_avg = report['macro avg']  # unweighted mean over the classes

    results.append({
        'Model': model_name,
        'Precision': macro_avg['precision'],
        'Recall': macro_avg['recall'],
        'F1-Score': macro_avg['f1-score'],
        'Accuracy': report['accuracy'],
        'Confusion Matrix': confusion_matrix(test_df_y_c, prediction),
    })

# Create DataFrame from results
results_df = pd.DataFrame(results)

# Display the results
print(results_df.to_string(index=False))


            Model  Precision   Recall  F1-Score  Accuracy                               Confusion Matrix
Gradient Boosting   0.588488 0.510196  0.534833  0.941122 [[503, 6, 438], [38, 2, 99], [163, 13, 11595]]
          XGBoost   0.696203 0.540999  0.572117  0.946955  [[555, 8, 384], [41, 7, 91], [155, 3, 11613]]
    Random Forest   0.773877 0.461198  0.506898  0.937544  [[341, 2, 604], [20, 4, 115], [61, 1, 11709]]


In [None]:
def _tune_classifier(estimator, param_grid):
    """Run an exhaustive grid search for ``estimator`` on the training split.

    Args:
        estimator: Unfitted classifier to tune.
        param_grid: Dict mapping parameter names to the list of values to try.

    Returns:
        The fitted ``GridSearchCV`` object (3-fold CV, ``n_jobs=-1``,
        ``verbose=2``), fitted on ``train_df_x`` / ``train_df_y_c``.
    """
    # NOTE(review): scoring is left at GridSearchCV's default. The test-set
    # reports show one class dominating, so a macro-averaged score such as
    # scoring='f1_macro' may be a better selection criterion -- confirm intent.
    search = GridSearchCV(estimator, param_grid, cv=3, n_jobs=-1, verbose=2)
    search.fit(train_df_x, train_df_y_c)
    return search


# Estimators are seeded so the selected hyperparameters and the resulting
# predictions are repeatable across runs.

# Define hyperparameter grid for Gradient Boosting Classifier
gbc_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Grid search for Gradient Boosting Classifier
gbc_grid = _tune_classifier(GradientBoostingClassifier(random_state=42), gbc_param_grid)
gbc_best = gbc_grid.best_estimator_
gbc_pred = gbc_best.predict(test_df_x)

# Define hyperparameter grid for XGBoost Classifier
xgbc_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Grid search for XGBoost Classifier
xgbc_grid = _tune_classifier(xgb.XGBClassifier(random_state=42), xgbc_param_grid)
xgbc_best = xgbc_grid.best_estimator_
xgbc_pred = xgbc_best.predict(test_df_x)

# Define hyperparameter grid for Random Forest Classifier
rfc_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Grid search for Random Forest Classifier
rfc_grid = _tune_classifier(RandomForestClassifier(random_state=42), rfc_param_grid)
rfc_best = rfc_grid.best_estimator_
rfc_pred = rfc_best.predict(test_df_x)
