## Imports

In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Read in Data

In [13]:
# location of the flight-details CSV that feeds the rest of the notebook
# NOTE(review): this is an absolute, user-specific path — the load only works
# on a machine that has the file at exactly this location
FLIGHTS_CSV = "/Users/harjappansingh/Documents/College/Junior Year/Data Science/CA 2/flight_details.csv"
data = pd.read_csv(FLIGHTS_CSV)

In [14]:

# structural overview of the raw frame: column names, non-null counts, dtypes
data.info()
# preview the first rows; as the last expression this is rendered as the cell output
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            1480 non-null   object
 1   Flight Name     1480 non-null   object
 2   Stops           1480 non-null   object
 3   Price           1480 non-null   object
 4   Duration        1480 non-null   object
 5   Departure-Time  1480 non-null   object
 6   Arrival-Time    1480 non-null   object
dtypes: object(7)
memory usage: 81.1+ KB


Unnamed: 0,Date,Flight Name,Stops,Price,Duration,Departure-Time,Arrival-Time
0,2024-10-30,"Aer Lingus, Cathay Pacific",2 stops,€893,40h 35m,17:15,20:50+2
1,2024-10-30,"Aer Lingus, Cathay Pacific",2 stops,€893,40h 35m,17:15,20:50+2
2,2024-10-30,SWISS,2 stops,"€1,322",25h 35m,18:40,07:15+2
3,2024-10-30,Qantas Airways,1 stop,"€1,839",23h 10m,20:50,07:00+2
4,2024-10-30,Air France,2 stops,"€1,163",33h 55m,16:40,13:35+2


In [15]:
# Step 1: make sure every 'Date' value is stored as a plain string
# (the column is already object dtype, so this is only a consistency guard)
date_text = data['Date'].astype(str)
data['Date'] = date_text
print(data['Date'].head())  # peek at the leading rows to confirm

0    2024-10-30
1    2024-10-30
2    2024-10-30
3    2024-10-30
4    2024-10-30
Name: Date, dtype: object


In [16]:
# Step 2: create a new column 'Airline' holding only the first airline listed in 'Flight Name'
# carriers that are not in the list below are grouped under 'Other Airlines'
#
# NOTE(review): the value_counts recorded for this cell contains no 'Qatar' row, so the
# short label matched nothing in this dataset. 'Qatar Airways' is added on the assumption
# that this is how the carrier is spelled in 'Flight Name' — TODO confirm against the raw data.
# The original 'Qatar' entry is kept so anything that matched before still matches.
top_airlines = ['Qatar', 'Qatar Airways', 'Air France', 'British Airways', 'Emirates',
                'Etihad Airways', 'Finnair', 'KLM', 'Turkish Airlines',
                'Qantas Airways', 'SWISS']

# take the text before the first comma and trim surrounding whitespace (vectorised)
first_airline = data['Flight Name'].str.split(',').str[0].str.strip()

# keep the name when it is a listed carrier, otherwise fall back to the catch-all label
data['Airline'] = first_airline.where(first_airline.isin(top_airlines), 'Other Airlines')
print(data['Airline'].value_counts())  # checking the distribution of airlines


Airline
Other Airlines      378
Qantas Airways      190
KLM                 189
Emirates            182
British Airways     114
Turkish Airlines    112
Air France          104
Etihad Airways       83
Finnair              66
SWISS                62
Name: count, dtype: int64


In [17]:
# Step 3: convert 'Stops' from text such as "2 stops" to an integer count
def stops_to_int(label):
    """Return the number of stops described by one 'Stops' cell.

    "non-stop" and "nonstop" (any casing, surrounding spaces ignored) give 0.
    Any other string is expected to start with the count, e.g. "1 stop" -> 1.
    Values that are not strings are passed through int(), so running this
    cell again on the already-converted column does not fail.

    Raises ValueError when the leading token is not a whole number and
    IndexError when the string is empty.
    """
    if not isinstance(label, str):
        return int(label)
    text = label.strip().lower()
    if text in ('non-stop', 'nonstop'):
        return 0
    return int(text.split()[0])


data['Stops'] = data['Stops'].apply(stops_to_int)
print(data['Stops'].value_counts())  # checking the distribution of stops


Stops
2    922
1    368
3    190
Name: count, dtype: int64


In [18]:
# Step 4: strip the currency symbol and thousands separator from 'Price', then convert to float
# astype(str) first so the cell can be re-run after 'Price' has already become numeric
# (the .str accessor is only valid on string data); regex=False makes both replacements literal
price_text = (
    data['Price']
    .astype(str)
    .str.replace('€', '', regex=False)
    .str.replace(',', '', regex=False)
)
data['Price'] = price_text.astype(float)
print(data['Price'].head())  # checking the first few values


0     893.0
1     893.0
2    1322.0
3    1839.0
4    1163.0
Name: Price, dtype: float64


In [19]:
# Step 5: convert 'Duration' to total hours with only two decimal places
# splitting by 'h' and 'm', calculating the total time in hours, and rounding to 2 decimals
def duration_to_hours(duration):
    """Convert a duration label such as "40h 35m" into decimal hours.

    Parameters
    ----------
    duration : str or int or float
        Text made of an optional "<H>h" part followed by an optional "<M>m"
        part, e.g. "40h 35m", "23h" or "45m". A value that is already
        numeric is treated as hours and only rounded, so applying the
        function to an already-converted column is harmless.

    Returns
    -------
    float
        Total hours rounded to two decimal places.

    Raises
    ------
    ValueError
        If the text contains neither an hour nor a minute figure, or one of
        the figures is not a whole number.
    """
    if isinstance(duration, (int, float)):
        return round(float(duration), 2)

    text = str(duration).strip().lower()
    hours_text, h_marker, remainder = text.partition('h')
    if not h_marker:
        # no hour component at all (e.g. "45m"): everything belongs to the minutes part
        hours_text, remainder = '', text
    hours_text = hours_text.strip()
    minutes_text = remainder.partition('m')[0].strip()

    if not hours_text and not minutes_text:
        raise ValueError(f"cannot parse duration: {duration!r}")

    hours = int(hours_text) if hours_text else 0
    minutes = int(minutes_text) if minutes_text else 0
    return round(hours + minutes / 60, 2)

# run the converter over every 'Duration' value and store the result back in the column
duration_hours = data['Duration'].map(duration_to_hours)
data['Duration'] = duration_hours
print(data['Duration'].head())  # confirm the values are now hours with 2 decimals


0    40.58
1    40.58
2    25.58
3    23.17
4    33.92
Name: Duration, dtype: float64


In [20]:
# Step 6: keep 'Departure-Time' and drop 'Arrival-Time' as it can be derived
# errors='ignore' lets the cell be re-run after the column is already gone
# (the in-place drop raised KeyError on a second run)
data = data.drop(columns=['Arrival-Time'], errors='ignore')
print(data.columns)  # checking the remaining columns


Index(['Date', 'Flight Name', 'Stops', 'Price', 'Duration', 'Departure-Time',
       'Airline'],
      dtype='object')


In [22]:
# Step 7: drop the 'Flight Name' column as it's no longer needed
# errors='ignore' lets the cell be re-run after the column is already gone
# (the in-place drop raised KeyError on a second run)
data = data.drop(columns=['Flight Name'], errors='ignore')
print(data.columns)  # checking the remaining columns


Index(['Date', 'Stops', 'Price', 'Duration', 'Departure-Time', 'Airline'], dtype='object')


In [21]:
# checking the cleaned dataset
# info() writes its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output
data.info()
# last expression of the cell: rendered as the cell output
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            1480 non-null   object 
 1   Flight Name     1480 non-null   object 
 2   Stops           1480 non-null   int64  
 3   Price           1480 non-null   float64
 4   Duration        1480 non-null   float64
 5   Departure-Time  1480 non-null   object 
 6   Airline         1480 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 81.1+ KB
None
         Date                 Flight Name  Stops   Price  Duration  \
0  2024-10-30  Aer Lingus, Cathay Pacific      2   893.0     40.58   
1  2024-10-30  Aer Lingus, Cathay Pacific      2   893.0     40.58   
2  2024-10-30                       SWISS      2  1322.0     25.58   
3  2024-10-30              Qantas Airways      1  1839.0     23.17   
4  2024-10-30                  Air France      2  1163.0     33.92   

