In [27]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# Load the dataset

In [31]:
file_path = '/content/cleaned_data.csv'
data_cleaned = pd.read_csv(file_path)

# Display original column names to identify the columns to rename

In [33]:
original_column_names = data_cleaned.columns.tolist()
print("Original Column Names:\n", original_column_names)

Original Column Names:
 ['Route', 'Departing Port', 'Arriving Port', 'Airline', 'Month', 'Sectors Scheduled', 'Sectors Flown', 'Cancellations', 'Departures On Time', 'Arrivals On Time', 'Departures Delayed', 'Arrivals Delayed', 'OnTime Departures \n(%)', 'OnTime Arrivals \n(%)', 'Cancellations \n\n(%)']


# Rename columns

In [34]:
data_cleaned.rename(columns={
    'ontime_departures_\n(%)': 'ontime_departures_(%)',
    'ontime_arrivals_\n(%)': 'ontime_arrivals_(%)',
    'cancellations_\n\n(%)': 'cancellations_(%)'
}, inplace=True)


# Display updated column names to verify renaming

In [35]:
updated_column_names = data_cleaned.columns.tolist()
print("Updated Column Names:\n", updated_column_names)

Updated Column Names:
 ['Route', 'Departing Port', 'Arriving Port', 'Airline', 'Month', 'Sectors Scheduled', 'Sectors Flown', 'Cancellations', 'Departures On Time', 'Arrivals On Time', 'Departures Delayed', 'Arrivals Delayed', 'OnTime Departures \n(%)', 'OnTime Arrivals \n(%)', 'Cancellations \n\n(%)']


# Identify missing values

In [36]:
missing_values = data_cleaned.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Route                      0
Departing Port             0
Arriving Port              0
Airline                    0
Month                      0
Sectors Scheduled          0
Sectors Flown              0
Cancellations              0
Departures On Time         0
Arrivals On Time           0
Departures Delayed         0
Arrivals Delayed           0
OnTime Departures \n(%)    0
OnTime Arrivals \n(%)      0
Cancellations \n\n(%)      0
dtype: int64


# Fill missing values

In [37]:
for column in data_cleaned.columns:
    if data_cleaned[column].dtype == 'object':
        data_cleaned[column].fillna(data_cleaned[column].mode()[0], inplace=True)
    else:
        data_cleaned[column].fillna(data_cleaned[column].median(), inplace=True)


# Remove rows or columns with significant missing data if necessary

In [38]:
threshold = 0.6  # Remove columns with more than 60% missing values
data_cleaned.dropna(thresh=int(threshold * len(data_cleaned)), axis=1, inplace=True)
data_cleaned.dropna(inplace=True)


In [39]:
# Identify and remove duplicate rows

In [40]:
duplicates = data_cleaned.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")
data_cleaned = data_cleaned.drop_duplicates()

Number of duplicate rows: 0


# Using Z-score to detect outliers

In [41]:
z_scores = np.abs(stats.zscore(data_cleaned.select_dtypes(include=[np.number])))
outliers = (z_scores > 3).sum(axis=1)
print(f"Number of outliers: {(outliers > 0).sum()}")

Number of outliers: 783


# remove outliers

In [42]:
data_cleaned = data_cleaned[(z_scores < 3).all(axis=1)]

# Standardize data

In [43]:
scaler = StandardScaler()
data_standardized = pd.DataFrame(scaler.fit_transform(data_cleaned.select_dtypes(include=[np.number])), columns=data_cleaned.select_dtypes(include=[np.number]).columns)


# Normalize data

In [44]:
scaler = MinMaxScaler()
data_normalized = pd.DataFrame(scaler.fit_transform(data_cleaned.select_dtypes(include=[np.number])), columns=data_cleaned.select_dtypes(include=[np.number]).columns)

# Convert columns to appropriate data types

In [46]:
data_cleaned['Month'] = pd.to_datetime(data_cleaned['Month'], errors='coerce')


# Ensure categorical consistency

In [48]:
data_cleaned['Route'] = data_cleaned['Route'].str.strip().str.lower()

# Encoding categorical data

In [50]:
data_encoded = pd.get_dummies(data_cleaned, columns=['Route', 'Airline'], drop_first=True)


# Apply consistent formatting

In [51]:
data_cleaned.columns = data_cleaned.columns.str.strip().str.lower().str.replace(' ', '_')


# Display column names to identify irrelevant columns

In [52]:
column_names = data_cleaned.columns.tolist()
print("Column Names:\n", column_names)


Column Names:
 ['route', 'departing_port', 'arriving_port', 'airline', 'month', 'sectors_scheduled', 'sectors_flown', 'cancellations', 'departures_on_time', 'arrivals_on_time', 'departures_delayed', 'arrivals_delayed', 'ontime_departures_\n(%)', 'ontime_arrivals_\n(%)', 'cancellations_\n\n(%)']


# Aggregate data

In [53]:
data_aggregated = data_cleaned.groupby('month').agg({
    'sectors_flown': 'sum',
    'cancellations': 'sum',
    'departures_on_time': 'mean',
    'arrivals_on_time': 'mean'
}).reset_index()


# Transform data

In [54]:
data_aggregated['log_sectors_flown'] = np.log1p(data_aggregated['sectors_flown'])

# Create new features

In [55]:
data_cleaned['on_time_ratio'] = data_cleaned['departures_on_time'] / data_cleaned['arrivals_on_time']

# Automate checks

In [56]:
def validate_data(df):
    assert df.isnull().sum().sum() == 0, "There are missing values"
    assert df

#Downloading the cleaned dataset

In [67]:
import pandas as pd

# Save the cleaned and renamed dataset to a new CSV file
data_cleaned.to_csv('cleaned_data_final.csv', index=False)

from google.colab import files

# Download the file
files.download('cleaned_data_final.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>