In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


In [23]:
# Load the team's processed dataset from the project-relative data folder.
df = pd.read_csv(filepath_or_buffer='data/processed_data_team_29.csv')

We imported all the needed libraries and loaded our dataset into a dataframe. Now it is time to preprocess the data.

In [24]:

# Binary target: 1 when a rating is strictly above the cut-off, 0 otherwise.
# NOTE(review): a missing rating compares as False and therefore lands in
# class 0 - confirm the data has no missing ratings or that this is intended.
threshold = 3.5
df['rating_class'] = df['rating'].gt(threshold).astype(int)

# Target first, then the feature matrix: the identifier column, the raw rating
# and the derived label are all excluded from the features.
y = df['rating_class']
X = df.drop(columns=['name', 'rating', 'rating_class'])


In [25]:

# Replace the two non-numeric feature columns with integer representations:
#   - 'types'       -> integer codes produced by a fitted LabelEncoder
#   - 'has_website' -> 0/1 integers (presumably a boolean flag; verify upstream)
# The encoder is kept in `label_encoder` so the codes can be mapped back later.
label_encoder = LabelEncoder()
X = X.assign(
    types=label_encoder.fit_transform(X['types']),
    has_website=X['has_website'].astype(int),
)


In [26]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [27]:

# Preprocessing stage: each group of columns is routed to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    # Numeric features are standardised directly.
    # NOTE(review): this branch has no imputer - confirm these columns
    # contain no missing values.
    (
        'num',
        StandardScaler(),
        ['price_level', 'latitude', 'longitude'],
    ),
    # Integer-coded features: fill gaps with the most frequent value,
    # then standardise.
    (
        'cat',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('scaler', StandardScaler()),
        ]),
        ['types', 'has_website'],
    ),
])


In [28]:

# End-to-end model: the column preprocessing followed by a 100-tree random
# forest. The fixed seed keeps repeated fits identical.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])


In [29]:

# Fit the full pipeline on the training split, then predict labels for the
# held-out rows. fit() returns the fitted pipeline, so the calls are chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)



We finished preprocessing the data and training the model. Now it is time to evaluate it on the test set.

In [30]:

# Evaluate the fitted pipeline's predictions against the held-out labels.

# Per-class precision / recall / F1 and support.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Overall accuracy. It must be computed BEFORE it is printed: the f-string
# below reads `accuracy`, so on a fresh kernel (Restart & Run All) printing
# first raises NameError instead of showing the score.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.15      0.18        13
           1       0.92      0.95      0.93       133

    accuracy                           0.88       146
   macro avg       0.57      0.55      0.56       146
weighted avg       0.86      0.88      0.87       146

Accuracy: 0.8767
