<a href="https://colab.research.google.com/github/HiraZaheer/Python-Projects/blob/main/Final_Project_Hira.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import sys



In [4]:
# --------------- Load dataset ---------------
# Location of the reviews CSV; edit this if your copy of the file lives elsewhere.
csv_path = "Hotel_Reviews.csv"
df = pd.read_csv(csv_path)
# Echo the column names that were read from the file.
print("Columns found:", df.columns.tolist())

# --------------- Build text + labels ---------------
def _clean_chunk(s):
    if pd.isna(s):
        return ""
    s = str(s).strip()
    if s.lower() in ("no positive", "no negative"):
        return ""
    return s

# Scores strictly above this value are labelled "positive"; raise it (e.g. to 7)
# for a stricter definition. Scores are on the dataset's 0-10 scale.
POSITIVE_SCORE_THRESHOLD = 6


def _score_to_sentiment(scores, threshold=POSITIVE_SCORE_THRESHOLD):
    """Map a Series of review scores to "positive" / "negative" labels.

    Args:
        scores: pandas Series of scores; values are cast to float first.
        threshold: Scores strictly greater than this become "positive".

    Returns:
        A Series of string labels aligned with ``scores``. A missing score
        (NaN) compares False against the threshold and is therefore labelled
        "negative".
    """
    is_positive = scores.astype(float) > threshold
    return is_positive.map({True: "positive", False: "negative"})


text_col = None
label_col = None
# Case-insensitive lookup: lower-cased column name -> actual column name.
cols_lower = {c.lower(): c for c in df.columns}

if "review" in cols_lower:
    # Case A: small sample CSV you might use for quick testing
    text_col = cols_lower["review"]
    if "sentiment" in cols_lower:
        # Labels are already present; use them as-is.
        label_col = cols_lower["sentiment"]
    elif "reviewer_score" in cols_lower:
        # No label column: derive one from the numeric score.
        label_col = "Sentiment"
        df[label_col] = _score_to_sentiment(df[cols_lower["reviewer_score"]])
    else:
        sys.exit("Could not find 'Sentiment' or 'Reviewer_Score' to make labels.")
else:
    # Case B: Kaggle 515k Hotel_Reviews.csv
    needed = ["positive_review", "negative_review", "reviewer_score"]
    if not all(n in cols_lower for n in needed):
        sys.exit("Expected Kaggle columns 'Positive_Review', 'Negative_Review', 'Reviewer_Score'. Not all were found.")
    pos_col = cols_lower["positive_review"]
    neg_col = cols_lower["negative_review"]
    score_col = cols_lower["reviewer_score"]

    # Concatenate positive + negative parts (dropping 'No Positive'/'No Negative')
    df["Text"] = (df[pos_col].apply(_clean_chunk) + " " + df[neg_col].apply(_clean_chunk)).str.strip()
    # Drop empty texts. The .copy() makes the filtered frame independent of the
    # original, so the column assignment below does not trigger pandas'
    # SettingWithCopyWarning.
    df = df[df["Text"].str.len() > 0].copy()
    text_col = "Text"

    # Label from score using the shared threshold defined above.
    df["Sentiment"] = _score_to_sentiment(df[score_col])
    label_col = "Sentiment"

# --------------- Split ---------------
X = df[text_col]
y = df[label_col]
# Stratify only when the target has exactly two classes; otherwise use a plain split.
stratify_on = y if y.nunique() == 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_on,
)

# --------------- Vectorize ---------------
# Unigram + bigram TF-IDF features with English stop words removed,
# keeping at most 20,000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
    stop_words="english",
)
# Fit on the training split only, then apply the fitted vectorizer to the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --------------- Train ---------------
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# --------------- Evaluate ---------------
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)



Columns found: ['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date', 'Average_Score', 'Hotel_Name', 'Reviewer_Nationality', 'Negative_Review', 'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews', 'Positive_Review', 'Review_Total_Positive_Word_Counts', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags', 'days_since_review', 'lat', 'lng']
Accuracy: 0.9211549839477794

Classification Report:
               precision    recall  f1-score   support

    negative       0.71      0.39      0.50     10547
    positive       0.93      0.98      0.96     92554

    accuracy                           0.92    103101
   macro avg       0.82      0.68      0.73    103101
weighted avg       0.91      0.92      0.91    103101



In [8]:
# --------------- Demo ---------------
# Hand-written example reviews used to spot-check the trained classifier.
samples = [
    "The hotel was clean and the staff were very friendly.",
    "Terrible experience, the room was dirty and smelly.",
    "Excellent service and beautiful location!",
    "The food was bad and service was very slow.",
    "Rooms were spacious and well maintained.",
    "Very noisy at night, couldn’t sleep properly.",
    "Absolutely loved my stay here!",
    "The bathroom was unhygienic and water was leaking.",
    "Great breakfast buffet and comfortable beds.",
    "Not worth the price, very disappointing.",
]

# Vectorize and classify the whole batch in one call instead of once per review;
# the result holds one predicted label per sample, in the same order.
sample_preds = model.predict(vectorizer.transform(samples))
for s, pred in zip(samples, sample_preds):
    print(f"Review: {s}\nPredicted: {pred}\n")


Review: The hotel was clean and the staff were very friendly.
Predicted: positive

Review: Terrible experience, the room was dirty and smelly.
Predicted: negative

Review: Excellent service and beautiful location!
Predicted: positive

Review: The food was bad and service was very slow.
Predicted: negative

Review: Rooms were spacious and well maintained.
Predicted: positive

Review: Very noisy at night, couldn’t sleep properly.
Predicted: positive

Review: Absolutely loved my stay here!
Predicted: positive

Review: The bathroom was unhygienic and water was leaking.
Predicted: positive

Review: Great breakfast buffet and comfortable beds.
Predicted: positive

Review: Not worth the price, very disappointing.
Predicted: negative

