In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Read the reviews CSV, decoding it as Latin-1 instead of pandas' default UTF-8
df = pd.read_csv(filepath_or_buffer='Reviews.csv', encoding='latin1')


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data that preprocess_text relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models for word_tokenize. NLTK >= 3.9
#     loads 'punkt_tab' (it replaced the pickled 'punkt' models); 'punkt' is
#     still requested so older NLTK releases keep working.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the lexicon used by WordNetLemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Patterns compiled once here instead of being looked up on every call.
_HTML_TAG_RE = re.compile('<[^<]+?>')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

# Lazily-built NLTK objects shared by every preprocess_text call
# (filled in by _nlp_resources the first time a string is processed).
_NLP_CACHE = {}


def _nlp_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on the first call only."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise one review into a space-joined string of lemmatized tokens.

    Steps, in order: replace HTML tags with a space, delete every character
    that is not an ASCII letter, digit or whitespace, lowercase, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize each remaining token.

    Punctuation is deleted rather than replaced by a space, so "don't"
    becomes "dont" and "good.bad" becomes "goodbad".

    The stop-word set and the lemmatizer are built once and reused, because
    this function is applied to every row of the dataset.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            NaN pandas produces for a missing cell) is treated as empty.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = _HTML_TAG_RE.sub(' ', text)
    text = _NON_ALNUM_RE.sub('', text)
    text = text.lower()

    stop_words, lemmatizer = _nlp_resources()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return ' '.join(tokens)


# Run the cleaning routine over every review and keep the result in a new column
df['Preprocessed_Text'] = df['Text'].map(preprocess_text)

# Show the first ten cleaned reviews as a quick sanity check
print(df['Preprocessed_Text'].iloc[:10])


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer limited to 1000 features
# (raise or lower max_features to trade detail for matrix width)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary from the cleaned reviews and encode them in one step
X_tfidf = tfidf_vectorizer.fit_transform(raw_documents=df['Preprocessed_Text'])

# Report the dimensions of the resulting matrix
print("Shape of TF-IDF matrix:", X_tfidf.shape)


In [None]:
# Optional: fetch the feature names from the fitted TF-IDF vectorizer and print them.
# NOTE(review): presumably at most 1000 entries, given max_features=1000 where
# the vectorizer is created — confirm.
feature_names = tfidf_vectorizer.get_feature_names_out()
print("Feature names:", feature_names)

In [None]:
# Drop rows with missing values in any column. This mutates `df` in place; the
# sampling cell further down indexes into the cleaned frame.
df.dropna(inplace=True)

# Target variable: the 'Score' column of the cleaned frame
y = df['Score']

# Split the cleaned documents *before* vectorizing. Fitting TF-IDF on the whole
# corpus and splitting afterwards lets the test reviews influence the
# vocabulary and IDF weights the model is trained on (data leakage), which
# makes the test metrics optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['Preprocessed_Text'], y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training documents only, then
# encode the held-out documents with that same fitted vectorizer
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Refresh the full-corpus matrix so `X_tfidf` lines up with the cleaned frame
X_tfidf = tfidf_vectorizer.transform(df['Preprocessed_Text'])

# Initialize logistic regression model
logistic_regression_model = LogisticRegression()

# Train the model
logistic_regression_model.fit(X_train, y_train)

# Predictions on both partitions, so train and test accuracy can be compared
y_pred_train = logistic_regression_model.predict(X_train)
y_pred_test = logistic_regression_model.predict(X_test)

# Evaluate the model (accuracy_score is imported from sklearn.metrics with the
# other scikit-learn names at the top of the notebook)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# Classification report
print("Classification Report for Test Data:")
print(classification_report(y_test, y_pred_test))


In [None]:
# A hand-written review used to smoke-test the trained pipeline
example_text = "This product exceeded my expectations. It works perfectly and is worth every penny."

# Clean it with the same routine that was applied to the dataset reviews
preprocessed_example_text = preprocess_text(example_text)

# Encode it with the already-fitted vectorizer as a one-document batch
example_text_vectorized = tfidf_vectorizer.transform(raw_documents=[preprocessed_example_text])

# Ask the model for its prediction on that single encoded document
predicted_sentiment = logistic_regression_model.predict(X=example_text_vectorized)

# Display the prediction
print("Predicted Sentiment:", predicted_sentiment)


In [None]:
import random

# Choose random examples from the dataset.
# The sample size is clamped to the number of rows because random.sample
# raises ValueError when asked for more items than the population holds.
random.seed(42)  # For reproducibility
n_examples = min(5, len(df))
random_indices = random.sample(range(len(df)), n_examples)  # Positional indices
random_examples = df.iloc[random_indices]

# Predict sentiments for the random examples, running each raw review through
# the same preprocess -> vectorize -> predict steps used for the single example
for index, row in random_examples.iterrows():
    example_text = row['Text']
    preprocessed_example_text = preprocess_text(example_text)
    example_text_vectorized = tfidf_vectorizer.transform([preprocessed_example_text])
    # predict() returns one label per document; take the only one
    predicted_sentiment = logistic_regression_model.predict(example_text_vectorized)[0]
    actual_sentiment = row['Score']
    print("Example Text:", example_text)
    print("Actual Sentiment:", actual_sentiment)
    print("Predicted Sentiment:", predicted_sentiment)
    print("-------------------------------")
