## ***MILIN SHARMA 23118045***

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score,  accuracy_score
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.diagnostic import linear_rainbow

## Reading the dataset

In [None]:
# Load the tips dataset; the path is relative, so the CSV must sit in the
# current working directory.
df=pd.read_csv("tips (data for regression problem).csv")
# Bare expression: the notebook renders the DataFrame as the cell output.
df

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Summary statistics for the columns pandas treats as numeric.
df.describe()

# Extracting numerical columns into a separate dataframe

In [None]:
# Keep a numeric-only view of the raw data (taken before one-hot encoding)
# so it can be modelled separately later on.
df_n = df.select_dtypes(include='number')
print(df_n)

## Converting the categorical columns into numerical using one hot encoding

In [None]:
# One-hot encode every categorical column. drop_first=True drops the first
# level of each category so the dummy columns are not perfectly collinear.
# NOTE: this rebinds `df`; df_n above still refers to the pre-encoding data.
df = pd.get_dummies(df, drop_first=True)
df

## Plotting various graphs to analyze the relationships

In [None]:
# 1. Scatter Plot Visualization
# Plot tip against total bill. The column names must match the ones used in
# the rest of the notebook ('total_bill' and 'tip').
plt.figure(figsize=(10, 7))
sns.scatterplot(x='total_bill', y='tip', data=df)
plt.title('Relationship between Bill and Tip Amount')
plt.xlabel('Amount of Bill')
plt.ylabel('Amount of Tip')
plt.show()

In [None]:
# 2. Pair Plot
# Pairwise scatter plots (with per-column distributions on the diagonal)
# for every column of the encoded frame.
sns.pairplot(data=df)
plt.show()

In [None]:
# 3. Correlation Matrix
# Compute the pairwise correlations once, then render them as an annotated
# heatmap.
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='Spectral')
plt.title('Correlation Matrix')
plt.show()

## Test for Linearity - Rainbow Test

In [None]:
# StandardScaler is imported here because the notebook's top-level import
# cell does not provide it; without this the cell fails with a NameError
# when the notebook is run top to bottom.
from sklearn.preprocessing import StandardScaler

# Prepare data and scale features
X_data = df.drop(columns='tip')
y_target = df['tip']
scaler = StandardScaler()
X_scaled_data = scaler.fit_transform(X_data)
# statsmodels does not add an intercept on its own, so prepend a constant column.
X_scaled_data = sm.add_constant(X_scaled_data)

# Fit OLS model
model_ols = sm.OLS(y_target, X_scaled_data).fit()

# Perform the Rainbow Test for model linearity
# (null hypothesis: the linear specification is adequate)
statistic, p_value = linear_rainbow(model_ols)
print(f"Rainbow Test Statistic: {statistic}")
print(f"p-value: {p_value}")

if p_value > 0.05:
    print("No evidence against linearity (p-value > 0.05). Linearity assumption holds.")
else:
    print("Evidence against linearity (p-value <= 0.05). Non-linear models may be needed.")

# Predict values (in-sample: the same rows the model was fitted on)
predicted_values = model_ols.predict(X_scaled_data)

# Plot Actual vs Predicted values
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_target, y=predicted_values)
# 45-degree reference line: points on it are predicted perfectly.
plt.plot([y_target.min(), y_target.max()], [y_target.min(), y_target.max()], color='red', linestyle='--', lw=2)
plt.xlabel("Observed Values")
plt.ylabel("Predicted Values")
plt.title("Comparison of Actual and Predicted Values")
plt.show()

## Residuals plot to check for linearity

In [None]:
# Build the train/test split and the scaled matrices inside this cell: no
# earlier cell defines them, so relying on them would raise a NameError when
# the notebook is executed top to bottom.
from sklearn.preprocessing import StandardScaler

X_data = df.drop(columns='tip')
y_target = df['tip']
X_train_data, X_test_data, y_train, y_test = train_test_split(
    X_data, y_target, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only so no test information leaks in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_data)
X_test_scaled = scaler.transform(X_test_data)

# Fit a linear regression model
reg_model = LinearRegression()
reg_model.fit(X_train_scaled, y_train)

# Make predictions on the test data
predicted_values_test = reg_model.predict(X_test_scaled)

# Calculate the residuals
residuals_test = y_test - predicted_values_test

# Plot the residuals
# The residuals are already computed, so draw them directly with a LOWESS
# trend line. sns.residplot would regress its y argument on x again and show
# the residuals of *that* fit, i.e. residuals of residuals.
plt.figure(figsize=(10, 6))
sns.regplot(x=predicted_values_test, y=residuals_test, lowess=True, line_kws={'color': 'red', 'lw': 2})
plt.xlabel("Test Set Predicted Values")
plt.ylabel("Residuals")
plt.title("Residuals Plot to Assess Linearity")
plt.axhline(0, color='black', linestyle='--', lw=1)
plt.show()

## Line Plot (for time-series data) 

In [None]:
# Sort data by the total bill (used as a proxy for time series)
sorted_df = df.sort_values(by='total_bill')
plt.figure(figsize=(15, 6))
plt.plot(sorted_df['total_bill'], sorted_df['tip'], marker='o', linestyle='-', color='b')
plt.title("Tip Amounts vs. Total Bill")
plt.xlabel("Bill Amount")
plt.ylabel("Tip Given")
plt.show()

## **Performing regression tests to predict tip amount using all the features**

In [None]:
### Model Training and Performance Evaluation
# Split the full one-hot encoded feature set here. No earlier cell defines
# X_train/X_test, and this section is meant to use all the features.
X = df.drop(columns='tip')
y = df['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

evaluation_results = {}


def _fit_and_score(model):
    """Fit *model* on the training split and score it on the test split.

    Returns a ``(predictions, metrics)`` tuple, where ``metrics`` is a dict
    with the keys ``'MSE'`` and ``'R2 Score'``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    metrics = {
        'MSE': mean_squared_error(y_test, predictions),
        'R2 Score': r2_score(y_test, predictions)
    }
    return predictions, metrics


# 1. Linear Regression
linear_model = LinearRegression()
y_pred_linear, evaluation_results['Linear Regression'] = _fit_and_score(linear_model)

# 2. Ridge Regression
ridge_regressor = Ridge(alpha=1.0)
y_pred_ridge, evaluation_results['Ridge Regression'] = _fit_and_score(ridge_regressor)

# 3. Lasso Regression
lasso_regressor = Lasso(alpha=0.1)
y_pred_lasso, evaluation_results['Lasso Regression'] = _fit_and_score(lasso_regressor)

# 4. Decision Tree Regressor
# Seeded so the reported metrics are reproducible between runs.
decision_tree_model = DecisionTreeRegressor(random_state=42)
y_pred_dt, evaluation_results['Decision Tree Regression'] = _fit_and_score(decision_tree_model)

# 5. Random Forest Regressor
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
y_pred_rf, evaluation_results['Random Forest Regression'] = _fit_and_score(random_forest_model)

# 6. Support Vector Regression (SVR)
svr_regressor = SVR()
y_pred_svr, evaluation_results['Support Vector Regression'] = _fit_and_score(svr_regressor)

# 7. K-Nearest Neighbors (KNN) Regression
knn_regressor = KNeighborsRegressor(n_neighbors=5)
y_pred_knn, evaluation_results['K-Nearest Neighbors Regression'] = _fit_and_score(knn_regressor)

# Print evaluation results
for model_name, metrics in evaluation_results.items():
    print(f"{model_name}: MSE = {metrics['MSE']:.2f}, R2 Score = {metrics['R2 Score']:.2f}")

### SVR seems to be the most promising model, followed by Lasso Regression because of low MSE and high R2 Score.

### **Next, we can perform feature engineering to improve the accuracy of the model by selecting different sets of columns**
### -   Using only numerical columns

In [None]:
# Split data into features and target variable
X = df_n.drop('tip', axis=1)  
y = df_n['tip']               

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
### Model Training and Performance Evaluation
# Instantiate every estimator up front, then fit and score them in one loop.
linear_regressor = LinearRegression()
ridge_regressor = Ridge(alpha=1.0)
lasso_regressor = Lasso(alpha=0.1)
decision_tree = DecisionTreeRegressor()
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
svr_regressor = SVR()
knn_regressor = KNeighborsRegressor(n_neighbors=5)

# (label, estimator) pairs in the order they are fitted and reported.
candidates = [
    ('Linear Regression', linear_regressor),
    ('Ridge Regression', ridge_regressor),
    ('Lasso Regression', lasso_regressor),
    ('Decision Tree Regression', decision_tree),
    ('Random Forest Regression', random_forest),
    ('Support Vector Regression', svr_regressor),
    ('K-Nearest Neighbors Regression', knn_regressor),
]

model_results = {}
test_predictions = {}
for label, estimator in candidates:
    estimator.fit(X_train, y_train)
    test_predictions[label] = estimator.predict(X_test)
    model_results[label] = {
        'MSE': mean_squared_error(y_test, test_predictions[label]),
        'R2 Score': r2_score(y_test, test_predictions[label]),
    }

# Expose the per-model predictions under their individual names as well.
y_pred_linear = test_predictions['Linear Regression']
y_pred_ridge = test_predictions['Ridge Regression']
y_pred_lasso = test_predictions['Lasso Regression']
y_pred_dt = test_predictions['Decision Tree Regression']
y_pred_rf = test_predictions['Random Forest Regression']
y_pred_svr = test_predictions['Support Vector Regression']
y_pred_knn = test_predictions['K-Nearest Neighbors Regression']

# Display model evaluation results
for model_name, metrics in model_results.items():
    print(f"{model_name}: MSE = {metrics['MSE']:.2f}, R2 Score = {metrics['R2 Score']:.2f}")

## Here too, when only the numerical columns are used, SVR seems to be the most promising model, followed by Lasso Regression, because of their low MSE and high R2 Score.

## **Predicting the feature importance**

In [None]:
from sklearn.preprocessing import StandardScaler

# Target and predictors taken from the one-hot encoded frame.
y_target = df['tip']
X_data = df.drop(columns='tip')

# 80/20 train/test split with a fixed seed.
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
    X_data, y_target, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training rows only, so the
# fitted coefficients are on a comparable scale across features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_data)
X_test_scaled = scaler.transform(X_test_data)

# Linear model fitted on the standardized features.
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train_data)

# Pair each feature with its coefficient and rank by absolute magnitude.
features = X_data.columns
coefficients = linear_model.coef_
importance_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
importance_df = importance_df.assign(
    Abs_Coefficient=np.abs(importance_df['Coefficient'])
).sort_values(by='Abs_Coefficient', ascending=False)

# Report the signed coefficients, largest magnitude first.
print("Feature Importance (based on coefficient magnitude):")
print(importance_df[['Feature', 'Coefficient']])

# Horizontal bar chart of the coefficient magnitudes.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Abs_Coefficient', y='Feature')
plt.title("Feature Importance Based on Linear Regression Coefficients")
plt.show()

## As we can see the total bill column is most important feature to predict the tip, thus we can separately use it to build a new model to improve the speed and accuracy

## **Using only total bill column to predict the tip**

In [None]:
# Train and compare the regressors using ONLY the total bill as a predictor.
# (This cell used to repeat the feature-importance analysis verbatim and
# never restricted the features to total_bill.)
X_bill = df[['total_bill']]  # double brackets keep a 2-D frame, as sklearn expects
y_target = df['tip']

# Same split parameters as the other experiments so the metrics are comparable.
X_train_bill, X_test_bill, y_train_bill, y_test_bill = train_test_split(
    X_bill, y_target, test_size=0.2, random_state=42
)

# Same model line-up and hyper-parameters as the earlier comparisons.
bill_only_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=42),
    'Support Vector Regression': SVR(),
    'K-Nearest Neighbors Regression': KNeighborsRegressor(n_neighbors=5),
}

# Fit each model on the single-feature training data and score it on the test data.
bill_only_results = {}
for model_name, model in bill_only_models.items():
    model.fit(X_train_bill, y_train_bill)
    bill_predictions = model.predict(X_test_bill)
    bill_only_results[model_name] = {
        'MSE': mean_squared_error(y_test_bill, bill_predictions),
        'R2 Score': r2_score(y_test_bill, bill_predictions)
    }

# Display model evaluation results
for model_name, metrics in bill_only_results.items():
    print(f"{model_name}: MSE = {metrics['MSE']:.2f}, R2 Score = {metrics['R2 Score']:.2f}")

## **Model Performance with Only Total Bill Feature**

When only the **Total Bill** column is used as a feature, the **Mean Squared Error (MSE)** decreases and the **R-squared (R²)** score increases. Among all the models tested, **Lasso Regression** performs the best in this scenario, demonstrating the most accurate predictions with the fewest features.


## **Conclusion**

### **Factors Affecting Tip Amounts:**
- Several models were trained using different sets of features. The feature importance was assessed using regression algorithms. Among all the features, **Total Bill** had the strongest impact on the tip amount, followed by **Group Size** (the number of people dining together).

### **Prediction Accuracy:**
- **Support Vector Regression (SVR)** provided the highest R-squared value of 0.57 and consistently delivered the best results in terms of **Mean Squared Error (MSE)**.
- **Lasso Regression** produced good results when only the **Total Bill** feature was used, indicating that it captures the relationship well with minimal variables.
- The other models performed reasonably well; however, the **MSE** for all models did not drop below 0.30, suggesting that there is still room for improvement in the predictive accuracy.

### **Insights for Management:**
- The **Total Bill** has a significant influence on the tip amount. Therefore, encouraging **premium customers** to visit more frequently could lead to an increase in tips. This can be achieved by offering **special promotions**, such as free drinks or loyalty rewards.
- Offering **premium dishes** of high quality at a moderate price point could also drive higher tip amounts, as these would likely increase the overall total bill, positively influencing the tip.
