# Zomato Data Analysis using an Advanced Convolutional Neural Network

This project implements an advanced Convolutional Neural Network (CNN) with 5 hidden layers to analyze the Zomato dataset. We'll use various Python libraries including TensorFlow, Keras, PySpark, and visualization tools to build and evaluate our model.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import pickle
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Start (or reuse) the Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("ZomatoAnalysis")
    .getOrCreate()
)

# Read the raw CSV with pandas, then mirror it as a PySpark DataFrame
df = pd.read_csv('data/zomato.csv')
spark_df = spark.createDataFrame(df)

# Print a short summary of the dataset: row count, column count, column names
print("Dataset Info:")
print("-" * 50)
row_total = spark_df.count()
print(f"Number of rows: {row_total}")
column_names = spark_df.columns
print(f"Number of columns: {len(column_names)}")
print("\nColumn Names:")
print(column_names)

# Exploratory Data Analysis

Let's analyze our dataset through various visualizations to understand the patterns and relationships between different features.

In [None]:
# Set up the plotting style.
# The bare 'seaborn' matplotlib style name was deprecated in matplotlib 3.6
# and later removed, so the theme is applied through seaborn itself.
sns.set_theme()
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# 1. Distribution of ratings
sns.histplot(data=df, x='rating', bins=20, ax=axes[0, 0])
axes[0, 0].set_title('Distribution of Ratings')

# 2. Average cost for two vs ratings
sns.scatterplot(data=df, x='rating', y='average_cost_for_two', ax=axes[0, 1])
axes[0, 1].set_title('Average Cost vs Ratings')

# 3. Cuisine distribution (top 10)
cuisine_counts = df['cuisines'].value_counts().head(10)
sns.barplot(x=cuisine_counts.values, y=cuisine_counts.index, ax=axes[1, 0])
axes[1, 0].set_title('Top 10 Cuisines')

# 4. Online delivery availability
# seaborn has no pie-chart function, so matplotlib's Axes.pie is called
# directly on the target subplot.
delivery_counts = df['has_online_delivery'].value_counts()
axes[1, 1].pie(delivery_counts.values, labels=delivery_counts.index)
axes[1, 1].set_title('Online Delivery Availability')

plt.tight_layout()
plt.show()

# Data Preprocessing and Feature Engineering

Now we'll prepare our data for the CNN model by:
1. Handling missing values
2. Encoding categorical variables
3. Scaling numerical features
4. Reshaping data for CNN input
5. Splitting the data into training and test sets

In [None]:
# Handle missing values.
# The fill must also reach the Spark DataFrame, because that is the frame the
# encoders below consume; filling only the pandas copy would leave the
# processed data unchanged. Spark ignores columns whose type does not match
# the fill value, so this numeric fill only affects numeric columns.
df = df.fillna(0)
spark_df = spark_df.fillna(0)

# Encode categorical variables using PySpark
categorical_columns = ['cuisines', 'location', 'rest_type', 'type']
for column_name in categorical_columns:
    # handleInvalid='keep' routes NULL categories into their own extra bucket
    # instead of raising an error.
    indexer = StringIndexer(
        inputCol=column_name,
        outputCol=f"{column_name}_index",
        handleInvalid='keep',
    )
    encoder = OneHotEncoder(
        inputCols=[f"{column_name}_index"],
        outputCols=[f"{column_name}_encoded"],
    )
    spark_df = indexer.fit(spark_df).transform(spark_df)
    spark_df = encoder.fit(spark_df).transform(spark_df)

# Convert back to pandas for further processing
df_processed = spark_df.toPandas()

# Scale numerical features
scaler = StandardScaler()
numerical_columns = ['votes', 'average_cost_for_two']
df_processed[numerical_columns] = scaler.fit_transform(df_processed[numerical_columns])

# Prepare features for CNN.
# Only numeric columns can be turned into a float tensor, so the feature matrix
# is limited to the scaled numerical columns plus the numeric category indices.
# The raw text columns and the Spark one-hot vector objects that are also
# present in df_processed are left out.
feature_columns = numerical_columns + [f"{name}_index" for name in categorical_columns]
X = df_processed[feature_columns].to_numpy(dtype='float32')
y = df_processed['rating']

# Reshape data for CNN (samples, timesteps, features)
X = X.reshape(X.shape[0], X.shape[1], 1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

# CNN Model Architecture

We'll create a CNN model with 5 hidden layers using different activation functions:
1. Conv1D layer with ReLU activation
2. Dense layer with tanh activation
3. Dense layer with sigmoid activation
4. Dense layer with softmax activation
5. Dense layer with linear activation

The model will use binary cross-entropy as the loss function.

In [None]:
# Define the CNN model
def create_cnn_model(input_shape):
    """Build the uncompiled Sequential 1-D CNN used in this notebook.

    The stack is: Conv1D (ReLU) -> MaxPooling1D -> Flatten -> four
    Dense + Dropout(0.3) pairs (tanh, sigmoid, softmax, linear) -> Dense(1).

    Args:
        input_shape: Shape of one sample, forwarded to the first Conv1D layer.

    Returns:
        The assembled, not yet compiled, Sequential model.
    """
    model = Sequential()

    # 1. Convolutional front end with ReLU, followed by pooling
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))

    # Collapse the convolutional output so Dense layers can consume it
    model.add(Flatten())

    # 2-5. Dense hidden layers, each followed by the same dropout rate
    dense_stack = (
        (128, 'tanh'),
        (64, 'sigmoid'),
        (32, 'softmax'),
        (16, 'linear'),
    )
    for units, activation in dense_stack:
        model.add(Dense(units, activation=activation))
        model.add(Dropout(0.3))

    # Single-unit output layer
    model.add(Dense(1))

    return model

# Create and compile the model.
# The network ends in a single linear unit and is scored with MAE/MSE (and R²
# later on), i.e. it is trained as a regressor. Binary cross-entropy, which the
# notebook text names, assumes targets in [0, 1] and probability outputs, so it
# is not a meaningful objective here; mean squared error is used as the loss.
# 'mae' and 'mse' stay in the metrics list because the evaluation cell unpacks
# exactly three values (loss, mae, mse) from model.evaluate().
model = create_cnn_model(input_shape=(X_train.shape[1], 1))
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae', 'mse'],
)

# Display model summary
model.summary()

# Model Training

We'll train the model with:
- Early stopping to prevent overfitting
- Model checkpoint to save the best model
- Batch size of 32
- Up to 100 epochs (early stopping may end training sooner)

In [None]:
# Callbacks: both watch the validation loss.
monitored_metric = 'val_loss'

# Stop after 10 epochs without improvement and roll back to the best weights
early_stopping = EarlyStopping(
    monitor=monitored_metric, patience=10, restore_best_weights=True
)

# Persist only the best-scoring model seen so far
model_checkpoint = ModelCheckpoint(
    'model/best_model.h5', monitor=monitored_metric, save_best_only=True
)

# Fit on the training split, holding out 20% of it for validation
fit_options = dict(
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping, model_checkpoint],
)
history = model.fit(X_train, y_train, **fit_options)

# Plot training vs. validation curves, one subplot per tracked quantity
plt.figure(figsize=(12, 4))

# (history key, human-readable label) for each panel, left to right
panels = (
    ('loss', 'Loss'),
    ('mae', 'MAE'),
)

for position, (metric_key, metric_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric_key], label=f'Training {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'Model {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()

plt.tight_layout()
plt.show()

# Model Evaluation and Predictions

Let's evaluate our model's performance on the test set and make predictions.

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Score the model on the held-out test split (returns loss, MAE, MSE)
test_loss, test_mae, test_mse = model.evaluate(X_test, y_test)
test_rmse = np.sqrt(test_mse)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

# Predict ratings for the test split
y_pred = model.predict(X_test)

# Scatter actual against predicted values; the dashed red diagonal marks a
# perfect prediction
rating_range = [y_test.min(), y_test.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(rating_range, rating_range, 'r--', lw=2)
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')
plt.title('Actual vs Predicted Ratings')
plt.tight_layout()
plt.show()

# Recompute the headline metrics with scikit-learn
r2_value = r2_score(y_test, y_pred)
mae_value = mean_absolute_error(y_test, y_pred)
rmse_value = np.sqrt(mean_squared_error(y_test, y_pred))

print("\nAdditional Metrics:")
print(f"R² Score: {r2_value:.4f}")
print(f"MAE: {mae_value:.4f}")
print(f"RMSE: {rmse_value:.4f}")

# Save Model and Scaler

Finally, we'll save our trained model and scaler for future use.

In [None]:
# Locations of the persisted artifacts
model_path = 'model/zomato_cnn_model.h5'
scaler_path = 'model/zomato_scaler.pkl'

# Write the trained network to disk
model.save(model_path)

# Pickle the fitted scaler next to it
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler saved successfully!")

# Function to load the model and make predictions
def load_model_and_predict(data, feature_columns=None):
    """Load the saved model and scaler and predict on new data.

    The columns listed in the module-level ``numerical_columns`` are scaled
    with the saved scaler; the selected feature columns are then reshaped to
    ``(samples, n_features, 1)`` and passed to the model.

    Args:
        data: pandas DataFrame containing at least ``numerical_columns`` and
            every column named in ``feature_columns``.
        feature_columns: Ordered column names to feed to the model. Defaults
            to ``numerical_columns``. Pass the training feature list when the
            model was trained on more columns than the scaled numerical ones.

    Returns:
        The array returned by the model's ``predict`` call.

    Raises:
        ValueError: If the number of selected features does not match the
            input length the loaded model reports.
    """
    # Load the model
    loaded_model = tf.keras.models.load_model('model/zomato_cnn_model.h5')

    # Load the scaler.
    # NOTE: unpickling can execute arbitrary code; only load a scaler file
    # that was written by this notebook or another trusted source.
    with open('model/zomato_scaler.pkl', 'rb') as f:
        loaded_scaler = pickle.load(f)

    # Scale the numerical columns on a copy so the caller's frame is untouched
    prepared = data.copy()
    prepared[numerical_columns] = loaded_scaler.transform(prepared[numerical_columns])

    if feature_columns is None:
        feature_columns = numerical_columns
    features = prepared[list(feature_columns)].to_numpy(dtype=float)

    # Fail early with a readable message instead of an opaque shape error
    # from inside the model.
    input_shape = getattr(loaded_model, 'input_shape', None)
    expected_length = input_shape[1] if input_shape else None
    if expected_length is not None and features.shape[1] != expected_length:
        raise ValueError(
            f"Model expects {expected_length} features per sample but "
            f"{features.shape[1]} were provided; pass the training feature "
            "list via feature_columns."
        )

    # Reshape for CNN
    reshaped_data = features.reshape(features.shape[0], features.shape[1], 1)

    # Make predictions
    return loaded_model.predict(reshaped_data)

# Demonstrate the prediction helper on the first row of the dataset
print("Example usage of the prediction function:")
sample_data = df[numerical_columns].iloc[:1]
print("\nSample input:")
print(sample_data)
predicted_rating = load_model_and_predict(sample_data)[0][0]
print("\nPredicted rating:", predicted_rating)