In [1]:
import pandas as pd  
from sklearn.impute import SimpleImputer
import numpy as np


In [2]:
# Load the flights data and keep only the columns this notebook works with.
# 'carrier' is listed last on purpose: later cells address it as the final column.
selected_columns = [
    'dep_time', 'sched_dep_time', 'dep_delay',
    'arr_time', 'sched_arr_time', 'arr_delay',
    'air_time', 'distance',
    'hour', 'minute', 'month', 'day',
    'carrier',
]
df = pd.read_csv('flights.csv')[selected_columns]
df.head(10)

Unnamed: 0,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,air_time,distance,hour,minute,month,day,carrier
0,517.0,515,2.0,830.0,819,11.0,227.0,1400,5,15,1,1,UA
1,533.0,529,4.0,850.0,830,20.0,227.0,1416,5,29,1,1,UA
2,542.0,540,2.0,923.0,850,33.0,160.0,1089,5,40,1,1,AA
3,544.0,545,-1.0,1004.0,1022,-18.0,183.0,1576,5,45,1,1,B6
4,554.0,600,-6.0,812.0,837,-25.0,116.0,762,6,0,1,1,DL
5,554.0,558,-4.0,740.0,728,12.0,150.0,719,5,58,1,1,UA
6,555.0,600,-5.0,913.0,854,19.0,158.0,1065,6,0,1,1,B6
7,557.0,600,-3.0,709.0,723,-14.0,53.0,229,6,0,1,1,EV
8,557.0,600,-3.0,838.0,846,-8.0,140.0,944,6,0,1,1,B6
9,558.0,600,-2.0,753.0,745,8.0,138.0,733,6,0,1,1,AA


## Step 1: Replace NaN values with the column mean

In [3]:
# Tip: print(df.columns[df.isna().any()].tolist()) lists the columns that contain missing values.

# Work on the raw values of the frame. The last column ('carrier') holds text,
# so only the columns before it are imputed.
x = df.values

# Mean imputation: every NaN is replaced by the mean of its column.
# Fitting and transforming happen in a single call on the same slice.
xi = SimpleImputer(strategy='mean', missing_values=np.nan)
x[:, :-1] = xi.fit_transform(x[:, :-1])

x

array([[517.0, 515.0, 2.0, ..., 1.0, 1.0, 'UA'],
       [533.0, 529.0, 4.0, ..., 1.0, 1.0, 'UA'],
       [542.0, 540.0, 2.0, ..., 1.0, 1.0, 'AA'],
       ...,
       [1349.1099473093045, 1210.0, 12.639070257304708, ..., 9.0, 30.0,
        'MQ'],
       [1349.1099473093045, 1159.0, 12.639070257304708, ..., 9.0, 30.0,
        'MQ'],
       [1349.1099473093045, 840.0, 12.639070257304708, ..., 9.0, 30.0,
        'MQ']], dtype=object)

In [4]:
import pickle

# Persist the fitted imputer so that new data can later be filled with the
# same column means learned here, without fitting again.
# NOTE(review): the file is written with pickle, so despite the '.h5' name it
# is not an HDF5 file.
imputer_file = 'xi.h5'

with open(imputer_file, 'wb') as sink:
    pickle.dump(xi, sink)
    print('! Saved successfully !')

# Load the imputer back whenever it is needed.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open(imputer_file, 'rb') as source:
    xi_from_read_h5 = pickle.load(source)
    print('! Read from h5 was successful !')



! Saved successfully !
! Read from h5 was successful !


## Step 2: Convert categorical (text) columns to numbers with one-hot encoding (OHE)


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [6]:
# Turn the text labels in the last column (carrier) into integer codes.
# NOTE(review): OneHotEncoder can usually take string categories directly, so
# this intermediate LabelEncoder step may be unnecessary; confirm before removing.
le = LabelEncoder()
carrier_codes = le.fit_transform(x[:, -1])
x[:, -1] = carrier_codes

x[0]

array([517.0, 515.0, 2.0, 830.0, 819.0, 11.0, 227.0, 1400.0, 5.0, 15.0,
       1.0, 1.0, 11], dtype=object)

In [7]:
# One-hot encode the last column (the carrier codes). Every other column is
# passed through unchanged and ends up after the encoded columns.
ct = ColumnTransformer(
    transformers=[('carrier', OneHotEncoder(), [-1])],
    remainder='passthrough',
)
# x is overwritten here, so re-running this cell on its own would encode the
# new last column again.
x = ct.fit_transform(x)
x[0]

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
       0.0, 0.0, 0.0, 517.0, 515.0, 2.0, 830.0, 819.0, 11.0, 227.0,
       1400.0, 5.0, 15.0, 1.0, 1.0], dtype=object)

## Step 3: Data standardization (rescale every feature to zero mean and unit variance)

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Fit the scaler on x, then replace x with its standardized version
# (each column has its mean removed and is divided by its standard deviation).
# NOTE(review): this also rescales the one-hot carrier columns produced
# earlier; confirm that is intended.
ss = StandardScaler()
ss.fit(x)
x = ss.transform(x)

In [10]:
# Show the standardized feature matrix.
x

array([[-0.24081671, -0.32809228, -0.04609347, ..., -0.58184581,
        -1.62500738, -1.67766769],
       [-0.24081671, -0.32809228, -0.04609347, ...,  0.14351208,
        -1.62500738, -1.67766769],
       [-0.24081671,  3.04792301, -0.04609347, ...,  0.71343614,
        -1.62500738, -1.67766769],
       ...,
       [-0.24081671, -0.32809228, -0.04609347, ..., -0.8409022 ,
         0.71797462,  1.62958997],
       [-0.24081671, -0.32809228, -0.04609347, ...,  1.69785043,
         0.71797462,  1.62958997],
       [-0.24081671, -0.32809228, -0.04609347, ...,  0.71343614,
         0.71797462,  1.62958997]])

# 🌻 خسته نباشید 