In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv("Travel.csv")

# Only the final expression of a cell is echoed, so the intermediate summaries
# are rendered explicitly instead of being silently discarded.
display(df.head())
df.info()
display(df.describe())
print("shape before de-duplication:", df.shape)

# Drop exact duplicate rows; plain assignment keeps the step explicit and
# avoids in-place mutation.
df = df.drop_duplicates()

# Sanity check: number of duplicate rows left (expected to be 0).
df.duplicated().sum()

In [None]:
# Percentage of missing values per column, highest first.
missing = df.isnull().mean()
display(missing.sort_values(ascending=False) * 100)

# Fill the categorical column with its most frequent value.
# The result is assigned back to the frame: fillna(inplace=True) on the
# df['Embarked'] selection is chained assignment, which pandas warns about and
# which does not update df when copy-on-write is active.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Fill the numeric column with its median.
df['Age'] = df['Age'].fillna(df['Age'].median())
# Alternative kept for reference: replace zeros with the column mean.
#df['Age'] = df['Age'].replace(0,df['Age'].mean())


# NOTE(review): 'col' looks like a placeholder column name - confirm the real
# column to drop; this raises KeyError when no such column exists.
df = df.drop(columns='col')
df.columns

In [None]:
# AutoEDA
!pip install pydantic-settings
from ydata_profiling import ProfileReport as pp
profile = pp(df, title="Data Profile Report", explorative=True)
profile.to_file("data_profile_final_raw.html")
print("The profiling report has been generated!")

In [None]:
#Outlier
# Box plot of every column before outlier handling. It gets its own figure and
# is rendered right away so that later plots in this cell are not drawn on the
# same Axes. `data=` is passed by keyword so the frame is not mistaken for `x`.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df)
plt.title('Before outlier handling')
plt.show()
def handle_outliers_iqr(df, factor=1.5, copy=False):
    """Clip every numeric column of ``df`` to its IQR fences.

    For each numeric, non-boolean column the fences are
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (``IQR = Q3 - Q1``);
    values outside the fences are replaced by the nearest fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are clipped.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.
    copy : bool, default False
        When False (the original behaviour) ``df`` itself is modified and
        returned. When True the input is left untouched and a clipped copy is
        returned.

    Returns
    -------
    pandas.DataFrame
        The clipped frame (the same object as ``df`` unless ``copy=True``).
    """
    if copy:
        df = df.copy()

    for col in df.columns:
        series = df[col]
        # Boolean columns count as "numeric" for pandas but have no meaningful
        # quartiles, so they are left as they are.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = factor * (q3 - q1)
        df[col] = series.clip(lower=q1 - spread, upper=q3 + spread)

    return df
# handle_outliers_iqr clips the frame it receives and returns that same object,
# so df2 and df refer to the same (now clipped) data.
df2 = handle_outliers_iqr(df)

# Open a new figure so the "after" box plot is not drawn over an earlier plot.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df2)
plt.title('After IQR clipping')
plt.show()

In [None]:
# prompt: eda for datetime feature in dataset

# Convert 'date_time' column to datetime objects
df['date_time'] = pd.to_datetime(df['date_time'])

# Extract features from the datetime column
df['year'] = df['date_time'].dt.year
df['month'] = df['date_time'].dt.month
df['day'] = df['date_time'].dt.day
# The plot below groups by hour, so that column has to be derived as well.
df['hour'] = df['date_time'].dt.hour

# Analyze the extracted features
# Example: Plot the traffic volume by hour of day
plt.figure(figsize=(12, 6))
sns.boxplot(x='hour', y='traffic_volume', data=df)
plt.title('Traffic Volume by Hour of Day')
plt.show()


In [None]:
# CORRELATION MATRIX
# Pearson correlation is only defined for numeric data, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so restrict the frame to its
# numeric columns first.
numeric_corr = df.select_dtypes(include=np.number).corr()

plt.figure(figsize=(10,7))
sns.heatmap(numeric_corr, annot=True)
plt.title("Correlation between the columns")
plt.show()

In [None]:
# splitting into num and categorical
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns
# Printed explicitly: a bare expression in the middle of a cell is not echoed.
print(num_cols, cat_cols, sep='\n')

# pie chart for categorical

for col in cat_cols:
    counts = df[col].value_counts()  # computed once, reused for sizes and labels
    plt.figure(figsize=(10,5))
    plt.pie(counts.values, labels=counts.index, autopct='%1.1f%%', startangle=90)
    plt.title(col)
    plt.show()

# histogram (with KDE) for numerical

for col in num_cols:
    plt.figure(figsize=(10,5))
    sns.histplot(df[col], kde=True)
    plt.title(col)
    # NOTE(review): the same fixed x-range is applied to every numeric column;
    # confirm 0-450 suits each of them, otherwise data is hidden.
    plt.xlim(0, 450)
    plt.show()

# or directly use pairplot

sns.pairplot(df)
plt.show()

# New figure: otherwise regplot draws onto the current Axes, which is one of
# the pairplot panels.
plt.figure(figsize=(10,5))
sns.regplot(x = 'Age', y = 'Fare', data = df)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,LabelEncoder

# Label-encode the two weather columns to integer codes.
# The same encoder object is re-fitted for the second column, so afterwards
# `le` only remembers the classes of 'weather_description'.
le = LabelEncoder()
df['weather_main'] = le.fit_transform(df['weather_main'])
df['weather_description']= le.fit_transform(df['weather_description'])

# Perform one-hot encoding
# NOTE(review): the columns were already turned into integer codes above, so
# the dummy columns are named after the codes rather than the original
# category strings - confirm both encodings are intended.
df = pd.get_dummies(df, columns=['weather_main', 'weather_description'], prefix=['weather_main', 'weather_description'],drop_first = True)
print(df.head())

# Manual ordinal mapping for the port of embarkation. The mapping is not named
# `dict`, because that would shadow the builtin and break later dict(...) calls
# in this notebook. Values missing from the mapping become NaN.
embarked_codes = {'S' : 1, 'C': 2, 'Q' : 3}
df['Embarked'] = df['Embarked'].map(embarked_codes)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Target is traffic_volume; the target and the raw timestamp are both left out
# of the feature matrix.
y = df['traffic_volume']
X = df.drop(columns=['traffic_volume', 'date_time'])
(X.shape, y.shape)

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Learn the scaling statistics on the training rows only, then apply them to
# both splits so nothing from the test set leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors and the hyperparameter grid searched for each of them.
# Every stochastic estimator is seeded (same seed as the train/test split) so a
# re-run of the notebook reproduces the same table.
models_and_params = {
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [True, False],
        }
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150, 200, 300],
            'max_depth': [None, 5, 10, 15, 20],
        }
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [2, 5, 10, 15],
        }
    },
    'XGBRegressor': {
        'model': XGBRegressor(eval_metric='rmse', random_state=42),
        'params': {
            'n_estimators': [50, 100, 150, 200],
            'max_depth': [3, 5, 7, 9],
            'learning_rate': [0.01, 0.1, 0.2, 0.3]
        }
    },
    'AdaBoostRegressor': {
        'model': AdaBoostRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200, 300],
            'learning_rate': [0.1, 0.5, 1, 2]
        }
    }
}

results = []

for model_name, model_info in models_and_params.items():
    model = model_info['model']
    params = model_info['params']

    # 5-fold grid search on the training split. The scoring is *negative* MSE,
    # so 'best_score' below is <= 0 and closer to 0 is better.
    grid = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train)

    # Evaluate the refitted best configuration on the held-out test split.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    results.append({
        'model': model_name,
        'best_params': grid.best_params_,
        'best_score': grid.best_score_,
        'mse': mse,
        'rmse': rmse,
        'r2': r2,
    })

# One row per model: chosen hyperparameters, CV score and test-set metrics.
results_df = pd.DataFrame(results)
results_df = results_df.round({'best_score': 2, 'mse': 2, 'rmse': 2, 'r2': 2})
print(results_df[['model', 'best_params', 'best_score', 'mse', 'rmse', 'r2']])

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay,precision_score, recall_score, f1_score, roc_auc_score,roc_curve


# Candidate classifiers and the hyperparameter grid searched for each of them.
# Every estimator is seeded (same seed as the train/test split) so a re-run of
# the notebook reproduces the same table.
models_and_params = {
    'RandomForestClassifier': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150, 200, 300],
            'max_depth': [None, 5, 10, 15, 20],
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [2, 5, 10, 15],
        }
    },
    'LogisticRegression': {
        # liblinear supports both the l1 and the l2 penalty searched below.
        'model': LogisticRegression(solver='liblinear', random_state=42),
        'params': {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10, 100],
            'max_iter': [100, 200, 300, 500]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(eval_metric='mlogloss', random_state=42),
        'params': {
            'n_estimators': [50, 100, 150, 200],
            'max_depth': [3, 5, 7, 9],
            'learning_rate': [0.01, 0.1, 0.2, 0.3]
        }
    },
    'AdaBoostClassifier': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200, 300],
            'learning_rate': [0.1, 0.5, 1, 2]
        }
    }
}

results = []

for model_name, model_info in models_and_params.items():
    model = model_info['model']
    params = model_info['params']

    # 5-fold grid search on the training split, selecting by accuracy.
    grid = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)  # Train on the training set

    # Evaluate the refitted best configuration on the held-out test split.
    # Weighted averaging is used for the per-class metrics; zero_division=0
    # reports 0 for classes that are never predicted.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    results.append({
        'model': model_name,
        'best_params': grid.best_params_,
        'best_score': grid.best_score_,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    })

# One row per model: chosen hyperparameters, CV score and test-set metrics.
results_df = pd.DataFrame(results)
results_df = results_df.round({'best_score': 2, 'accuracy': 2, 'precision': 2, 'recall': 2, 'f1': 2})
print(results_df[['model', 'best_params', 'best_score', 'accuracy', 'precision', 'recall', 'f1']])


In [None]:
# Baseline random forest with default hyperparameters, seeded so the reported
# accuracy is repeatable.
Rf_model=RandomForestClassifier(random_state=42)
Rf_model.fit(X_train,y_train)
y_pred_rf=Rf_model.predict(X_test)
print(accuracy_score(y_test,y_pred_rf))

# Optional exhaustive search over six hyperparameters, kept disabled as real
# comments so the cell does not echo a string. The grid below has
# 4 * 2 * 18 * 9 * 8 * 2 = 20,736 combinations, each fitted 3 times (cv=3).
# 'sqrt' replaces the former 'auto' value of max_features, which recent
# scikit-learn releases no longer accept.
#
# grid_param = {
#     "n_estimators" : [90,100,115,130],
#     'criterion': ['gini', 'entropy'],
#     'max_depth' : range(2,20,1),
#     'min_samples_leaf' : range(1,10,1),
#     'min_samples_split': range(2,10,1),
#     'max_features' : ['sqrt','log2']
# }
#
# grid_search=GridSearchCV(estimator=Rf_model,param_grid=grid_param,cv=3,verbose=2,n_jobs=-1)
# grid_search.fit(X_train,y_train)
# print(grid_search.best_params_)
# y_pred=grid_search.predict(X_test)
# accuracy_score(y_test,y_pred)

In [None]:
# These two classes are used below and are imported here so the cell runs on a
# fresh kernel.
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): absolute path - presumably a Colab upload location; confirm
# before running elsewhere.
df = pd.read_csv('/content/train.mnist.csv')

# The first column is used as the target, all remaining columns as features.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create an imputer to fill NaN values with the mean of each column
imputer = SimpleImputer(strategy='mean') # You can use other strategies like 'median' or 'most_frequent'

# Fit the imputer to your training data and transform it
X_train = imputer.fit_transform(X_train)

# Transform the test data using the fitted imputer
X_test = imputer.transform(X_test)

# Now, you can proceed with training your model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# Standardize the features before PCA. The scaled matrices are built here so
# the cell does not depend on variables that no other cell creates; the scaler
# is fitted on the training split only.
pca_scaler = StandardScaler()
X_train_scaled = pca_scaler.fit_transform(X_train)
X_test_scaled = pca_scaler.transform(X_test)

# Project onto the first 100 principal components (fitted on the training split).
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 3-D scatter of the first three components, coloured by class label.
# Labels are cast to str so plotly treats them as discrete categories.
y_train_scaled = y_train.astype(str)
# Only arrays are passed (no data_frame): the full frame `df` has a different
# number of rows than the training split being plotted.
fig = px.scatter_3d(x=X_train_pca[:,0],
                    y=X_train_pca[:,1],
                    z=X_train_pca[:,2],
                    color=y_train_scaled)
# A dict literal is used so the call does not depend on the builtin name `dict`.
fig.update_layout(margin={'l': 20, 'r': 20, 'b': 20, 't': 20},
                  paper_bgcolor='LightSteelBlue')
fig.show()

# pca.explained_variance_        -> variance of each component
# pca.explained_variance_ratio_  -> share of total variance per component
# Rule of thumb noted by the author: keep enough components for at least 80 %.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)          # cumulative sum
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Encode the target as consecutive integers 0..n_classes-1.
label = LabelEncoder()
y = label.fit_transform(y)

# Rescale every feature to [0, 1] so no feature dominates the distances.
# NOTE(review): X is the full, un-imputed feature frame - confirm it has no NaN,
# which KMeans does not accept.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# First fit with two clusters; report its diagnostics explicitly because bare
# expressions in the middle of a cell are not echoed.
kmeans = KMeans(n_clusters = 2, random_state = 42)
kmeans.fit(X)
print('inertia (k=2):', kmeans.inertia_)
print('silhouette (k=2):', silhouette_score(X, kmeans.labels_))


# Elbow method: inertia for k = 1..10.
inertia =[]
for k in range(1,11):
  kmeans = KMeans(n_clusters = k, random_state = 42)
  kmeans.fit(X)
  inertia.append(kmeans.inertia_)

plt.plot(range(1,11),inertia,marker='o', linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()


# Final model with three clusters.
kmeans = KMeans(n_clusters = 3, random_state = 42)
kmeans.fit(X)
labels = kmeans.labels_

# Cluster ids are arbitrary, so comparing them directly with the class labels
# is only right by coincidence. Each cluster is first mapped to the most common
# true class among its members; the resulting score is therefore a
# majority-vote ("purity") accuracy.
mapped_labels = np.empty_like(labels)
for cluster_id in np.unique(labels):
  members = labels == cluster_id
  mapped_labels[members] = np.bincount(y[members]).argmax()

correct_labels = int(np.sum(mapped_labels == y))
total_labels = len(labels)
accuracy = correct_labels/total_labels
print(accuracy)

print("Result %d out of %d samples corectly labelled" % (correct_labels, y.size))

print('Accuracy score: ', format(correct_labels/float(y.size)))
