In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
import os


In [30]:
nltk_data_path = r"C:\Users\Administrator\Desktop\Data Science\Result\Sentiment_Analysis\nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)


In [31]:
nltk.download('punkt_tab',download_dir=nltk_data_path)
nltk.download('punkt', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Administrator\Desktop\Data
[nltk_data]     Science\Result\Sentiment_Analysis\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\Desktop\Data
[nltk_data]     Science\Result\Sentiment_Analysis\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\Desktop\Data
[nltk_data]     Science\Result\Sentiment_Analysis\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\Desktop\Data
[nltk_data]     Science\Result\Sentiment_Analysis\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
try:
    print("Stopwords loaded:", stopwords.words('english')[:5])  # Verify stopwords
    print("Tokenizer test:", word_tokenize("This is a test sentence."))  # Verify tokenizer
except LookupError as e:
    print("Error loading NLTK resources:", e)
    raise


Stopwords loaded: ['i', 'me', 'my', 'myself', 'we']
Tokenizer test: ['This', 'is', 'a', 'test', 'sentence', '.']


In [33]:
file_path = '../../anonymized.csv'  # Adjust the path if needed
df = pd.read_csv(file_path)
print("Dataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   review_ID            2627 non-null   object 
 1   business_ID          2627 non-null   object 
 2   business_name        2627 non-null   object 
 3   business_category    2627 non-null   object 
 4   review_live_days     2627 non-null   float64
 5   review_message       2607 non-null   object 
 6   review_rating        2627 non-null   float64
 7   review_status        2627 non-null   object 
 8   review_warmup_prior  1940 non-null   object 
 9   review_notes         1937 non-null   object 
 10  review_created_date  2627 non-null   object 
 11  review_removed_date  687 non-null    object 
dtypes: float64(2), object(10)
memory usage: 246.4+ KB


In [34]:
df = df.dropna(subset=['review_message'])


In [35]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Preprocess a single text: lowercase, remove punctuation, tokenize, remove stopwords, and lemmatize."""
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

In [36]:
df['cleaned_review'] = df['review_message'].apply(preprocess_text)

# Display a few samples to verify preprocessing
print("Sample of cleaned reviews:")
print(df[['review_message', 'cleaned_review']].head())

Sample of cleaned reviews:
                                      review_message  \
0  Personally liked how they have a park for dogs...   
1  It's still a good playground to visit with kid...   
2  A small area/playground for kids and some picn...   
3                       One of the best parks for me   
4  Its a good location, however, the place needs ...   

                                      cleaned_review  
0  personally liked park dog facility could impro...  
1  still good playground visit kid good morning walk  
2  small areaplayground kid picnic good enough visit  
3                                      one best park  
4  good location however place need bit improveme...  


In [37]:
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_text = vectorizer.fit_transform(df['cleaned_review'])

# Display TF-IDF features and sample vectorized data
print("TF-IDF Feature Names:", vectorizer.get_feature_names_out()[:10])
print("Sample TF-IDF Vector:", X_text[0].toarray())

# Save processed data if needed
df['cleaned_review'] = df['cleaned_review'].values

# Save the DataFrame to a CSV file
df.to_csv('processed_reviews.csv', index=False)


TF-IDF Feature Names: ['aai' 'abandoned' 'abbeville' 'abbey' 'able' 'absolute' 'absolutely'
 'abundance' 'acacia' 'accept']
Sample TF-IDF Vector: [[0. 0. 0. ... 0. 0. 0.]]


In [38]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm  # Import tqdm for progress bar

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # For positive, negative, neutral

# Tokenize the review text with tqdm to display progress
def predict_sentiment(review_texts):
    predictions = []
    for i, text in tqdm(enumerate(review_texts), total=len(review_texts), desc="Processing reviews"):
        # Tokenize the review text
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

        # Predict sentiment labels
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_label = torch.argmax(outputs.logits, dim=1).item()  # Get the label as a single integer
            predictions.append(predicted_label)

    return predictions

# Apply sentiment analysis to the reviews and map the results to sentiment labels
sentiment_predictions = predict_sentiment(df['cleaned_review'].tolist())

# Map predictions to sentiment labels (e.g., 0 = negative, 1 = neutral, 2 = positive)
df['sentiment'] = [['negative', 'neutral', 'positive'][prediction] for prediction in sentiment_predictions]

# Display a sample of the sentiment predictions
print(df[['cleaned_review', 'sentiment']].head())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing reviews: 100%|██████████| 2607/2607 [03:39<00:00, 11.89it/s]

                                      cleaned_review sentiment
0  personally liked park dog facility could impro...  negative
1  still good playground visit kid good morning walk  negative
2  small areaplayground kid picnic good enough visit  negative
3                                      one best park  negative
4  good location however place need bit improveme...  negative





In [15]:
import nltk
nltk.download('vader_lexicon')  # Download VADER lexicon
from nltk.sentiment.vader import SentimentIntensityAnalyzer  
sia = SentimentIntensityAnalyzer()

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

def get_sentiment_score(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound'] 

# Apply sentiment score calculation to get a numerical sentiment score
df['sentiment_score'] = df['cleaned_review'].apply(get_sentiment_score)

# Prepare features and target
X = df[['sentiment_score']]  # Use sentiment_score as the feature for regression
y = df['review_rating']  # Target is the review rating

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Train Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions and evaluation
rf_pred = rf_model.predict(X_test)
lr_pred = lr_model.predict(X_test)

# Output evaluation metrics
print("Random Forest MSE:", mean_squared_error(y_test, rf_pred))
print("Linear Regression MSE:", mean_squared_error(y_test, lr_pred))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Random Forest MSE: 0.6482256869227419
Linear Regression MSE: 0.5844277159022991


In [18]:
# Evaluate Random Forest on training set
rf_train_pred = rf_model.predict(X_train)
rf_train_mse = mean_squared_error(y_train, rf_train_pred)
print("Random Forest Training MSE:", rf_train_mse)

# Evaluate Linear Regression on training set
lr_train_pred = lr_model.predict(X_train)
lr_train_mse = mean_squared_error(y_train, lr_train_pred)
print("Linear Regression Training MSE:", lr_train_mse)

# Compare training and testing MSE for signs of overfitting
print("Random Forest Test MSE:", mean_squared_error(y_test, rf_pred))
print("Linear Regression Test MSE:", mean_squared_error(y_test, lr_pred))


Random Forest Training MSE: 0.37660916424562335
Linear Regression Training MSE: 0.556409723232153
Random Forest Test MSE: 0.6482256869227419
Linear Regression Test MSE: 0.5844277159022991


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [10, 20, 50],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True],
    'max_samples': [0.5, 0.7,1],  # Include max_samples only if bootstrap=True
}

# Initialize GridSearchCV with verbose=3 to show progress of grid search fitting
grid_search = GridSearchCV(estimator=RandomForestRegressor(),
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           verbose=3)  # Verbose output for showing progress

grid_search.fit(X_train, y_train)

# Get the best parameters and best score
print("Best Random Forest Parameters:", grid_search.best_params_)
print("Best Random Forest Cross-validation MSE:", -grid_search.best_score_)

# Train the model with the best parameters
rf_model_best = grid_search.best_estimator_

# Evaluate performance on test data
rf_pred_best = rf_model_best.predict(X_test)
print("Random Forest Test MSE with Best Params:", mean_squared_error(y_test, rf_pred_best))

# Display a summary of the grid search results
print("\nGrid Search Results Summary:")
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation MSE:", -grid_search.best_score_)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
[CV 1/5] END bootstrap=True, max_depth=5, max_features=sqrt, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.573 total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=5, max_features=sqrt, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.586 total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=5, max_features=sqrt, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.641 total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=5, max_features=sqrt, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.426 total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=5, max_features=sqrt, max_samples=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=10;, score=-0.522 total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=5, max_features=sqrt, max_samples=0.5, min_samples_l

In [20]:
# Compare models
print("Random Forest Test MSE:", mean_squared_error(y_test, rf_pred))
print("Linear Regression Test MSE:", mean_squared_error(y_test, lr_pred))

# Select the best model
if mean_squared_error(y_test, rf_pred) < mean_squared_error(y_test, lr_pred):
    print("Random Forest is the better model.")
    final_model = rf_model
else:
    print("Linear Regression is the better model.")
    final_model = lr_model


Random Forest Test MSE: 0.6482256869227419
Linear Regression Test MSE: 0.5844277159022991
Linear Regression is the better model.


In [21]:
from xgboost import XGBRegressor

# Train XGBoost Regressor
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

# Predictions and evaluation
xgb_pred = xgb_model.predict(X_test)
print("XGBoost Test MSE:", mean_squared_error(y_test, xgb_pred))


XGBoost Test MSE: 0.604468642986895


In [22]:
from sklearn.svm import SVR

# Train SVM Regressor
svm_model = SVR()
svm_model.fit(X_train, y_train)

# Predictions and evaluation
svm_pred = svm_model.predict(X_test)
print("SVM Test MSE:", mean_squared_error(y_test, svm_pred))


SVM Test MSE: 0.6621805965736356


In [23]:
# Example: Adding word count as a feature
df['word_count'] = df['cleaned_review'].apply(lambda x: len(x.split()))

# Update feature set to include word count
X = df[['sentiment_score', 'word_count']]  # Add other features if needed


In [26]:
import pickle

# Save the final model using pickle
with open('final_model.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)


In [27]:

def predict_rating(review_text):
    # Preprocess the review text
    cleaned_text = preprocess_text(review_text)

    # Get sentiment score
    sentiment_score = get_sentiment_score(cleaned_text)

    # Create a DataFrame with feature names matching the training data
    feature_vector = pd.DataFrame([[sentiment_score]], columns=['sentiment_score'])

    with open('final_model.pkl', 'rb') as f:
        model = pickle.load(f)

    predicted_rating = model.predict(feature_vector)

    return predicted_rating[0] 


In [28]:
review_text = "This place was excellent, I loved it!"
predicted_rating = predict_rating(review_text)
print(f"The predicted rating for the review is: {predicted_rating}")


The predicted rating for the review is: 4.372690667074303
