In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2, chi2_contingency
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


In [None]:
df=pd.read_csv('Clean_Dataset.csv')

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.isnull().sum()

# Data Cleaning

In [None]:
df.drop(columns='Unnamed: 0',axis=1,inplace=True)#['']

In [None]:
df.head()

In [None]:
df.info()

In [None]:
count=df.airline.value_counts()

In [None]:
df.airline.unique()

In [None]:
count

In [None]:
percentage=df.airline.value_counts(normalize=True)*100#If True then the object returned will contain the relative
    #frequencies of the unique values.frequncy/total

In [None]:
percentage

In [None]:
freq_table=pd.DataFrame({'Frequency':count,'percent':percentage})

In [None]:
freq_table

In [None]:
# Bar chart of the per-airline frequencies held in freq_table.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=freq_table, x=freq_table.index, y="Frequency", palette="viridis")
plt.title("Airline", fontsize=16)
plt.xlabel("Airline", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
# Rotate the tick labels so long airline names do not overlap.
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`, so the explicit form is the only one
# that means "count rows per airline" on every version.
sns.countplot(data=df, x='airline')
plt.show()

In [None]:
df4=df.groupby('airline')['price'].count().sort_values(ascending=False)

In [None]:
df4

In [None]:
plt.figure(figsize=(10,6))
ax=sns.barplot(x=df4.index, y=df4.values, palette="viridis")
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.xlabel("Airline", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.title("Airline", fontsize=16)
plt.tight_layout()
plt.show()

### Vistara is the most frequently used airline and SpiceJet the least used

In [None]:
count_2=df['source_city'].value_counts()

In [None]:
sns.barplot(y=count_2.index,x=count_2,palette='viridis')

plt.title('Source City',fontsize=18,fontweight='600')

plt.xlabel('Count',fontsize=18)

plt.ylabel('City',fontsize=18)

plt.show()

### Delhi is the most common source city; Chennai is the least common

In [None]:
# Number of flights per departure-time slot.
counts = df['departure_time'].value_counts()
sns.barplot(x=counts.index, y=counts, palette='viridis')
# Title and axis labels so the figure stands alone, matching the other plots.
plt.title('Departure Time', fontsize=18, fontweight='600')
plt.xlabel('Time', fontsize=18)
plt.ylabel('Count', fontsize=18)
plt.show()

### The most common departure time is Morning: most flights depart in the morning

In [None]:
# Number of flights per arrival-time slot, drawn as horizontal bars.
counts = df.arrival_time.value_counts()
sns.barplot(x=counts, y=counts.index, palette='viridis')
plt.title('Arrival Time', fontweight='600', fontsize=18)
plt.xlabel('Count', fontsize=18)
plt.ylabel('Time', fontsize=18)
plt.show()

### The most common arrival time is Night

In [None]:
counts = df['stops'].value_counts()
counts

In [None]:
plt.figure(figsize=(8,8))
plt.pie(counts, labels=['One Stop','Zero Stops','Two Or More'], autopct='%1.1f%%',startangle=140,wedgeprops=dict(width=0.3))  
plt.title('Stops',fontsize=18)
plt.show()

In [None]:
# Arrival-time distribution: flights per arrival slot as horizontal bars.
counts = df.arrival_time.value_counts()
sns.barplot(palette='viridis', x=counts, y=counts.index)
plt.ylabel('Time', fontsize=18)
plt.xlabel('Count', fontsize=18)
plt.title('Arrival Time', fontweight='600', fontsize=18)
plt.show()

In [None]:
# Number of flights arriving at each destination city, as horizontal bars.
counts = df.destination_city.value_counts()
sns.barplot(x=counts, y=counts.index, palette='viridis')
plt.title('Destination', fontweight='600', fontsize=18)
plt.xlabel('Count', fontsize=18)
plt.ylabel('City', fontsize=18)
plt.show()

In [None]:
# Number of tickets per travel class.
counts = df['class'].value_counts()

sns.barplot(x=counts.index, y=counts, palette='viridis')

# This chart shows the 'class' column, so it is labelled as such (the earlier
# 'Destination' / 'City' labels were copied from the destination-city plot).
plt.title('Class', fontsize=18, fontweight='600')
plt.ylabel('Count', fontsize=18)
plt.xlabel('Class', fontsize=18)

plt.show()

In [None]:
df.duration.describe()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df['duration'],bins=30,kde=True)
#A histogram shows the distribution of data by grouping values into bins (intervals) 
#and counting how many data points fall into each bin.
#if you’re looking at durations of phone calls in minutes, a histogram would show how many calls 
#lasted 0–2 minutes, 2–4 minutes, 4–6 minutes, and so on.
##########################################################################################
# Bins are the intervals (or "buckets") into which the data is grouped.
# Smaller bins give more detail but may look noisy.
# Larger bins summarize the data more but may lose important patterns.
##########################################################################################
# KDE stands for Kernel Density Estimate. 
# It adds a smooth curve on top of the histogram to represent the probability density of the data.
# overall shape of the data
################################################################################################
plt.title('Trip Duration')

plt.xlabel('Hours')

plt.ylabel('Count')

plt.show()

In [None]:
df['days_left'].unique()

In [None]:
df['days_left'].describe()
#A KDE plot (Kernel Density Estimate plot) is a smooth curve that shows the probability density of a dataset.
# It provides a visual representation of where the data points are 
# concentrated and is an alternative to a histogram for understanding the distribution of data.

In [None]:
# Histograms can look blocky because their shape depends on the number of bins.
# A KDE smooths the data into a continuous curve, which makes patterns and trends easier to see.
#
# Kernel function:
# The kernel is a mathematical function (e.g. a Gaussian, which looks like a bell curve).
# It is applied to each data point to create a small "bump" around it; the bumps are summed to form the curve.

In [None]:
# Smoothed density of the number of days between booking and departure.
plt.figure(figsize=(10, 6))
sns.kdeplot(df['days_left'], color='skyblue', fill=True, alpha=0.5)
plt.title("Days Left for the Trip")
plt.xlabel("Days")
plt.ylabel("Density")
plt.show()

# BOXPLOT

In [None]:
# Box plot with the whole DataFrame passed as the single positional argument.
# NOTE(review): how seaborn interprets a positional DataFrame/Series depends on
# the installed version -- confirm the figure shows what is intended.
sns.boxplot(df)

In [None]:
# Box plot of the price column alone, to inspect its spread before the
# outlier handling below.
sns.boxplot(df.price)

In [None]:
def removal_outlier(df, column, plot=True):
    """Null out IQR outliers of ``df[column]`` in place and return the inlier rows.

    A value is an inlier when it lies inside the fences
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` (both ends inclusive), where Q1/Q3 are the
    25th/75th percentiles computed with the 'midpoint' method.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place: values of
        ``df[column]`` outside the fences are replaced by NaN.
    column : str
        Name of the numeric column to clean.
    plot : bool, default True
        When True, draw a box plot of the column before and after filtering.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``column`` value is inside the fences.
    """
    if plot:
        sns.boxplot(df[column])
        plt.title(f"Orginal box plot{column}")
        plt.show()

    # Ignore values that are already missing: np.percentile returns NaN as soon
    # as one NaN is present, which would turn the whole column into NaN when
    # the function is run a second time on the same frame.
    values = df[column].dropna()
    if values.empty:
        # Nothing to measure, hence no inliers either.
        return df[df[column].notna()]

    Q1, Q3 = np.percentile(values, [25, 75], method='midpoint')
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # between() is inclusive on both ends and False for NaN.
    inlier_mask = df[column].between(lower, upper)
    remove_outlier = df[inlier_mask]

    if plot:
        sns.boxplot(remove_outlier[column])
        plt.title(f"Box plot after outlier removal: {column}")
        plt.show()

    # Vectorised replacement for the row-wise apply: keep inliers, NaN the rest.
    df[column] = df[column].where(inlier_mask)
    return remove_outlier
    

# Run the outlier handling on price; removal_outlier overwrites df['price']
# in place, replacing values outside the IQR fences.
no_outlier=removal_outlier(df,'price')

# Inline form of the in-place step, kept for reference only (lower/upper exist
# only inside removal_outlier, so this line cannot run here as written):
# df['price']=df['price'].apply(lambda x:x if lower<=x<=upper else None)
    

In [None]:
sns.boxplot(df['price'])

# IQR = Q3 - Q1, computed on the non-missing prices only: np.percentile
# returns NaN if the column already contains NaN.
price_values = df['price'].dropna()
Q1, Q3 = np.percentile(price_values, [25, 75], method='midpoint')
IQR = Q3 - Q1
# Fences 1.5 * IQR outside the quartiles.
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
# Boolean masks marking the rows at or beyond each fence.
upper_array = np.array(df['price'] >= upper)
lower_array = np.array(df['price'] <= lower)
# Keep prices inside the fences (inclusive); everything else becomes NaN.
df['price'] = df['price'].where(df['price'].between(lower, upper))

# Data Preprocessing

In [None]:
df.head()

In [None]:
df.drop('flight',axis=1,inplace=True)

# Encoding

In [None]:
num_fea=[fea for fea in df.columns if df[fea].dtypes!='object']

In [None]:
df[num_fea]

lab=LabelEncoder()
for col in cat_fea:
    df[col]=lab.fit_transform(df[col])

In [None]:

# Pairwise correlation of the numeric columns, drawn as an annotated heat map.
correlation_matrix = df[num_fea].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    linewidths=0.5,
)
plt.title('Correlation')
plt.show()

In [None]:
# Hypothesis test: check whether pairs of categorical features are related (chi-square test on their contingency table)

In [None]:
# Columns with dtype 'object', i.e. the categorical features.
cat_fea=[fea for fea in df.columns if df[fea].dtypes=='object']

# Significance level shared by every test.
alpha = 0.05

# Chi-square test of independence for every unordered pair of distinct
# categorical features (a feature against itself, or the same pair in reverse
# order, adds no information).
for position, c1 in enumerate(cat_fea):
    for c2 in cat_fea[position + 1:]:
        dataset_table = pd.crosstab(df[c1], df[c2])
        observed_values = dataset_table.values
        print(f"\n{c1} vs {c2}")
        print("observed values:\n", observed_values)

        # chi2_contingency derives the expected frequencies, the statistic,
        # the p-value and the degrees of freedom, (rows-1)*(cols-1), from the
        # full table. By default it applies Yates' correction when dof == 1.
        chi_sqr_statistic, p_value, ddof, Expected_values = chi2_contingency(dataset_table)

        # Value the statistic must exceed to reject independence at `alpha`.
        critical_value = chi2.ppf(q=1 - alpha, df=ddof)

        print("Degree of freedom:-", ddof)
        print("critical_value:", critical_value)
        print("chi-square statistic:", chi_sqr_statistic)
        print('pvalue:', p_value)
        print("significant level:", alpha)

In [None]:
lab=LabelEncoder() 
for col in cat_fea: 
    df[col]=lab.fit_transform(df[col])

In [None]:

correlation_matrix=df.corr()
plt.figure(figsize=(10,8))
sns.heatmap(correlation_matrix,annot =True,cmap='coolwarm',fmt='.2f',linewidths=0.5)
plt.title('Correlation')
plt.show()

In [None]:
df.isnull().sum()

# Feature and target

In [None]:
X = df.drop(columns='price')

y = df['price'] 

In [None]:
y

In [None]:
y.isnull().sum()

In [None]:
y=y.fillna(y.mean())

In [None]:
y.isnull().sum()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True,random_state=42)

In [None]:
model_1=LinearRegression()
model_1.fit(X_train,y_train)
y_pred=model_1.predict(X_test)

In [None]:
X_train.shape[0]

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for Ridge.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Estimator whose alpha is tuned.
model = Ridge()

# 10-fold cross-validated search, scored by negative MSE (higher is better).
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
)
grid_search.fit(X_train, y_train)

# Best alpha found by the search.
print("Best parameters:", grid_search.best_params_)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [3, 5, 10, 20],
    'weights': ['uniform', 'distance'],
    # p: power parameter of the Minkowski distance (1 = Manhattan, 2 = Euclidean).
    'p': [1, 2],
    # algorithm: method used to compute the nearest neighbours.
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# 'accuracy' is a classification metric and fails on a continuous target;
# use a regression scorer, matching the Ridge search.
grid_search = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best Parameters
print("Best parameters:", grid_search.best_params_)


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' was removed from scikit-learn; for regression trees it meant
    # "use all features", which None already covers.
    'max_features': ['sqrt', 'log2', None],
    # NOTE(review): 'absolute_error' is much slower to fit than
    # 'squared_error' -- drop it if the search takes too long.
    'criterion': ['squared_error', 'absolute_error']
}

# 'accuracy_score' is not a valid scorer name (and accuracy is a classification
# metric); use a regression scorer, matching the Ridge search.
# random_state fixes the feature sub-sampling so the search is reproducible.
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(random_state=42), param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best Parameters
print("Best parameters:", grid_search.best_params_)


**C** (regularization parameter): controls the trade-off between achieving a low error on the training data and keeping the model simple.

Typical values: [0.1, 1, 10, 100, 1000]

**epsilon**: defines a margin of tolerance within which predictions are considered acceptable and incur no penalty.

Typical values: [0.01, 0.1, 0.5, 1]

**kernel**: specifies the type of kernel function to use.

Options: ['linear', 'poly', 'rbf', 'sigmoid']

**degree**: degree of the polynomial kernel function (only applicable to the `poly` kernel).

Typical values: [2, 3, 4]

**gamma**: defines how far the influence of a single training example reaches.

Options: ['scale', 'auto'] or specific values such as [0.1, 0.01, 0.001] (used with the `rbf`, `poly` and `sigmoid` kernels).

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

svr = SVR()

param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.5],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto']
    #'degree': [2, 3, 4]  # Used only for 'poly'
}

# 'accuracy' is a classification metric and fails on a continuous target;
# use a regression scorer, matching the Ridge search.
# NOTE(review): SVR training time grows steeply with the number of rows; this
# 72-combination, 10-fold search may be impractical on the full training set
# -- consider a subsample.
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=10)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)


# Candidate models with default hyper-parameters.
models = {
    'Ridge Regression': Ridge(),

    'Lasso Regression': Lasso(),

    'DecisionTreeRegressor': DecisionTreeRegressor(),

    # `kernel` was an undefined name here; the RBF kernel is named explicitly.
    'SVR': SVR(kernel='rbf'),
}

In [None]:
from sklearn.metrics import mean_squared_error

# Models to compare. RandomForestRegressor comes from sklearn.ensemble, which
# is imported in the notebook's import cell.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf'),
}

# One record per model; the table is built once at the end.
results = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # For regressors, .score() is the coefficient of determination R².
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    results.append({
        'Model': model_name,
        'Train R² Score': train_score,
        'Test R² Score': test_score,
        'Mean Squared Error': mse
    })

results_df = pd.DataFrame(results)

# Last expression of the cell: rendered as a rich table instead of plain text.
results_df