# Explore here

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load your dataset
# The path is relative to the notebook's working directory; adjust to your local path.
file_path = '../data/raw/iphone.csv'
# Read the whole CSV into a DataFrame; later cells use its 'reviewDescription'
# and 'ratingScore' columns.
iphone_reviews = pd.read_csv(file_path)

# Clean text (remove special characters, make lowercase)
def clean_text(text):
    """Return *text* lowercased, keeping only ASCII letters and whitespace.

    The argument is converted with ``str()`` before filtering, so non-string
    values are accepted. Digits, punctuation and any non-ASCII characters are
    dropped; whitespace is left exactly as it was.
    """
    letters_and_spaces = re.sub(r'[^A-Za-z\s]', '', str(text))
    return letters_and_spaces.lower()

# Apply cleaning to review descriptions
# Missing descriptions are replaced with '' first, so clean_text receives an
# empty string for them rather than a missing value.
iphone_reviews['reviewDescription'] = iphone_reviews['reviewDescription'].fillna('')
# The cleaned text goes into a new column; the original description is kept.
iphone_reviews['cleaned_review'] = iphone_reviews['reviewDescription'].apply(clean_text)

# Create sentiment labels based on ratingScore
def label_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Ratings of 2 or below are 'negative', a rating of exactly 3 is 'neutral',
    and every other value is 'positive'.
    """
    if rating <= 2:
        return 'negative'
    return 'neutral' if rating == 3 else 'positive'

# Derive the sentiment label of every review from its rating
iphone_reviews['sentiment'] = iphone_reviews['ratingScore'].apply(label_sentiment)

# Split the data into training and test sets (80% / 20%, fixed seed).
# stratify=y keeps the class proportions identical in both splits; with
# imbalanced classes a purely random split can leave the rare classes
# under-represented in the test set and make the per-class metrics unstable.
X = iphone_reviews['cleaned_review']
y = iphone_reviews['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
# max_features=5000 limits the vocabulary to the 5000 terms with the highest
# term frequency across the corpus.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data, and transform the test data
# The vocabulary and IDF weights are learned from the training split only; the
# test split is merely transformed, so no test-set statistics reach the models.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train Logistic Regression model (fit returns the fitted estimator itself)
logreg = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on the held-out split and print the per-class report
y_pred_logreg = logreg.predict(X_test_tfidf)
print(
    "Logistic Regression Performance:",
    classification_report(y_test, y_pred_logreg),
    sep="\n",
)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes model (fit returns the fitted estimator itself)
nb = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict on the held-out split and print the per-class report
y_pred_nb = nb.predict(X_test_tfidf)
print(
    "Naive Bayes Performance:",
    classification_report(y_test, y_pred_nb),
    sep="\n",
)


In [None]:
from sklearn.svm import SVC

# Train SVM model with default hyperparameters (fit returns the fitted estimator itself)
svm = SVC().fit(X_train_tfidf, y_train)

# Predict on the held-out split and print the per-class report
y_pred_svm = svm.predict(X_test_tfidf)
print(
    "SVM Performance:",
    classification_report(y_test, y_pred_svm),
    sep="\n",
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Initialize and train the Random Forest model (100 trees, fixed seed for repeatability)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Predict on the held-out split and print the per-class report
y_pred_rf = rf.predict(X_test_tfidf)
print(
    "Random Forest Performance:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)


In [None]:
# Collect model performance results for comparison
from sklearn.metrics import accuracy_score

# Display name -> estimator fitted in the cells above
models = {
    'Logistic Regression': logreg,
    'Naive Bayes': nb,
    'SVM': svm,
    'Random Forest': rf
}

# Evaluate and print accuracy of each model
# NOTE: `name`, `model`, `y_pred` and `accuracy` are ordinary notebook globals,
# so once the loop finishes `model` still refers to the last entry of `models`
# (the Random Forest), not to any particular model of interest.
for name, model in models.items():
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the parameter grid to search through.
# `gamma` is ignored by the linear kernel, so it is only varied for the RBF
# kernel; a single flat grid would refit the very same linear model once per
# gamma value (80 fits instead of 60 with cv=5).
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],  # Penalty parameter
    },
    {
        'kernel': ['rbf'],  # Radial Basis Function kernel
        'C': [0.1, 1, 10, 100],  # Penalty parameter
        'gamma': ['scale', 'auto'],  # Kernel coefficient
    },
]

# Set up the GridSearchCV to find the best combination of hyperparameters:
# 5-fold cross-validation scored on accuracy. n_jobs=-1 runs the independent
# fits in parallel on all available cores; it does not change the result.
grid_search = GridSearchCV(
    SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1
)

# Fit the grid search model (training split only)
grid_search.fit(X_train_tfidf, y_train)

# Print the best hyperparameters found
print("Best Parameters: ", grid_search.best_params_)

# Evaluate the tuned model on the held-out test split
y_pred_svm_tuned = grid_search.best_estimator_.predict(X_test_tfidf)
print("Tuned SVM Performance:")
print(classification_report(y_test, y_pred_svm_tuned))


In [None]:
# Experimental unigram+bigram vectorizer (not fitted yet).
# It gets its own name so that the fitted `vectorizer` -- the one the trained
# models were built on and the one pickled further down -- is not replaced by
# an unfitted instance.
bigram_vectorizer = TfidfVectorizer(ngram_range=(1, 2))


In [None]:
# Class-weighted SVM variant (not fitted yet).
# Kept under a separate name so the fitted `svm` from the earlier cell stays
# usable, e.g. when the model-comparison cell is re-run.
svm_balanced = SVC(class_weight='balanced')


Dataset Characteristics:

The dataset consists of iPhone reviews from Amazon, with three sentiment classes: positive, neutral, and negative.
The reviews are largely skewed towards the positive class (with most ratings being 4 or 5 stars), while neutral and negative reviews are relatively fewer in number.
This class imbalance makes the minority classes harder to learn; neutral sentiment in particular is where the model struggles most.
Model Performance:

After tuning the SVM model using a radial basis function (RBF) kernel, the overall accuracy improved to 85%. This indicates that the model is quite effective at classifying the reviews into the appropriate sentiment categories.
The model performs exceptionally well in predicting positive sentiment, achieving a high F1-score of 0.92 and recall of 0.97.
Predictions for negative sentiment also improved, with a reasonable F1-score of 0.72. However, recall still has room to improve: some negative reviews are not being identified as negative.
The neutral sentiment is where the model performs weakest, with an F1-score of 0.34. This is likely due to the class imbalance and the inherent difficulty in distinguishing neutral reviews from either positive or negative ones.
Model Implications:

The model is highly effective for classifying positive reviews, making it suitable for applications that prioritize identifying strong customer satisfaction.
It performs moderately well in detecting negative reviews, which could still be useful for flagging potentially dissatisfied customers. However, improvements in recall could ensure that more negative reviews are caught.
The inability to accurately detect neutral sentiment means that the model may struggle to identify mixed or balanced feedback, where users may express both pros and cons. In real-world applications, this could lead to an overemphasis on extreme sentiments (positive or negative).
Use Case Applications:

Customer Feedback Analysis: The model is well-suited for analyzing customer feedback at scale, particularly for identifying highly satisfied or dissatisfied customers.
Market Insights: Companies can use the model to understand the distribution of sentiment in product reviews, helping them gauge product reception and improve areas of concern based on negative reviews.
Customer Service Prioritization: Businesses could use the model to prioritize handling negative reviews quickly, keeping in mind that the model needs further refinement before it can detect neutral feedback reliably.

In [None]:
import pickle

# Save the fitted Logistic Regression model.
# `logreg` is referenced explicitly: the name `model` is the comparison loop's
# variable and is left bound to the last estimator it iterated over (the
# Random Forest), which would end up in a file named after logistic regression.
model_filename = '../models/logistic_regression_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(logreg, file)


In [None]:
# Save the vectorizer
# This must be the instance that was fitted on X_train, so that its vocabulary
# matches the features the saved model was trained on.
vectorizer_filename = '../models/tfidf_vectorizer.pkl'
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(vectorizer, file)


In [None]:
import streamlit as st
import pickle

# Load the saved model and vectorizer from ../models/
# Context managers close each file as soon as its object has been loaded.
# NOTE: unpickling executes code stored in the file -- only load artifacts
# that were produced by this project.
with open('../models/logistic_regression_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('../models/tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

st.title('Sentiment Analysis App')

# Input field for the user to enter text
user_input = st.text_area("Enter a review")

if st.button('Predict'):
    # Apply the same cleaning that was applied to the training reviews, so the
    # vectorizer sees text in the form it was fitted on.
    cleaned_input = clean_text(user_input)
    if cleaned_input.strip():
        # Transform the input text using the vectorizer
        transformed_input = vectorizer.transform([cleaned_input])
        # Predict the sentiment
        prediction = model.predict(transformed_input)
        st.write(f'The predicted sentiment is: {prediction[0]}')
    else:
        # Nothing usable was entered (empty, or no letters left after cleaning)
        st.write("Please enter a review to analyze.")


In [None]:
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine

# Load environment variables from .env file
load_dotenv()

# Get the DATABASE_URL from the environment (None when the variable is not set)
database_url = os.getenv('DATABASE_URL')

# Create the SQLAlchemy engine for the PostgreSQL database.
# The engine is only built when a URL is configured: create_engine does not
# accept None, so an unset variable is reported instead of raising.
if database_url:
    engine = create_engine(database_url)
else:
    print("DATABASE_URL environment variable not set.")


In [None]:
import os
from sqlalchemy import create_engine

# Read the connection string from the environment (None when it is not set)
database_url = os.getenv('DATABASE_URL')

if not database_url:
    # Nothing to connect to without a URL
    print("DATABASE_URL environment variable not set.")
else:
    # Create the SQLAlchemy engine for connecting to PostgreSQL
    engine = create_engine(database_url)

    # Open and immediately close one connection as a connectivity check;
    # any failure is reported rather than raised.
    try:
        connection = engine.connect()
        print("Connection to PostgreSQL successful!")
        connection.close()
    except Exception as exc:
        print(f"Error connecting to the database: {exc}")
