#verilerin linner regresyon ile tahmin edilmesi

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [6]:
df = pd.read_csv('data_cleaned.csv')

In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2141 entries, 0 to 2140
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   marketing-airlines  2141 non-null   object
 1   departure airport   2141 non-null   object
 2   arrival airport     2141 non-null   object
 3   luggage-unit        2141 non-null   object
 4   flight type         2141 non-null   object
 5   price               2141 non-null   int64 
 6   time                2141 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 117.2+ KB
None


In [8]:
df['marketing-airlines'] = df['marketing-airlines'].astype('category')
df['departure airport'] = df ['departure airport'].astype('category')
df['arrival airport'] = df ['arrival airport'].astype('category')
df['luggage-unit'] = df['luggage-unit '].astype('category')
df['flight type'] = df['flight type'].astype('category')
df['price'] = df['price'].astype('int')
df['time'] = df['time'].astype('int')

In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2141 entries, 0 to 2140
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   marketing-airlines  2141 non-null   category
 1   departure airport   2141 non-null   category
 2   arrival airport     2141 non-null   category
 3   luggage-unit        2141 non-null   object  
 4   flight type         2141 non-null   category
 5   price               2141 non-null   int64   
 6   time                2141 non-null   int64   
 7   luggage-unit        2141 non-null   category
dtypes: category(5), int64(2), object(1)
memory usage: 64.0+ KB
None


In [10]:
categorical_features = ['marketing-airlines', 'departure airport', 'arrival airport','luggage-unit','flight type']
numerical_features = ['time']

In [11]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [12]:
X = df.drop('price', axis=1)
y = df['price']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #, random_state=0

In [32]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [33]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preparation', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [34]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [35]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 26062061.32501024
RMSE: 5105.1014999714
R^2: 0.3139529877232474


In [43]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

71
[-6.00291789e+02 -1.37460614e+03  1.48599639e+03 -9.55720237e+02
  4.29911568e+02 -2.35141313e+03 -1.64038125e+03  2.63782196e+03
 -4.04644782e+03  2.46149176e+03 -5.20258866e+02 -6.19884140e+02
  3.52099679e+01 -4.23227668e+02 -1.38645481e+03 -6.27627344e+02
 -1.59335395e+03  1.18235265e+03  6.34611890e+03  3.02955862e+03
 -7.03293439e+02 -1.51108083e+03 -7.82650541e+02  4.33179586e+03
  3.68362430e+02 -2.74460134e+03 -1.02761860e+03 -9.88479690e+02
 -8.68319696e+02  1.88358948e+03 -9.92527620e+02 -5.20258866e+02
  1.48599639e+03  1.48599639e+03 -1.38645481e+03  2.49502850e+03
 -5.57247953e+02 -1.64038125e+03  3.25746903e+03  2.46149176e+03
 -9.78656811e+02 -7.03293439e+02 -2.94475058e+03 -1.37460614e+03
  7.43236546e+02 -5.35834931e+01 -6.19884140e+02 -7.08026578e+02
 -5.28734226e+03  1.18235265e+03  4.29911568e+02 -6.27627344e+02
 -1.53977046e+03 -8.50430445e+03  9.61913812e+03 -5.53099353e+02
  5.05684974e+03 -9.55720237e+02  3.68362430e+02  2.63782196e+03
 -5.20258866e+02 -7.82

In [44]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
time -600.2917889623756


In [45]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
Aer Lingus -1374.6061355247248
Air Algerie 1485.996389990532
Air Baltic -955.7202370962816
Air Europa 429.911568497124
Air France -2351.4131342999635
Air Serbia -1640.381250085099
Austrian 2637.8219611819245
British Airways -4046.447822757053
Brussels Airlines 2461.4917588071576
Eurostar -520.2588661744977
Eurowings -619.8841404454226
ITA - Italia Trasporto Aereo 35.20996794397558
Iberia -423.22766831103115
KLM Royal Dutch Airlines -1386.4548129377922
KM Malta Airlines -627.6273443895266
Lufthansa -1593.3539541466912
Luxair 1182.3526535635876
Norwegian Air International 6346.1188970144985
Norwegian Air Shuttle 3029.558624495898
Royal Air Maroc -703.2934390702683
Scandinavian Airlines -1511.0808295443305
Swiss -782.6505410833611
TAP Portugal 4331.795864507505
Tunisair 368.3624297524747
Vueling Airlines -2744.601338407463
easyJet -1027.6186014845557
LCY -1374.6061355247248
LGW 1485.996389990532
LHR -955.7202370962816
LTN 429.911568497124
QQS -2351.4131342999635
STN -

In [77]:

new_data = pd.DataFrame({
    'marketing-airlines': ['British Airways'],
    'departure airport': ['LGW'],
    'arrival airport': ['CDG'],
    'flight type': ['Direkt Uçuş'],
    'luggage-unit': ['1x23 kg'],
    'time': [200]

})

print(model.predict(new_data))

[5829.09810463]


In [78]:
print(df[(df['marketing-airlines'] == 'British Airways') & (df['flight type'] == 'Direkt Uçuş') & (df['departure airport'] == 'LGW')])

     marketing-airlines departure airport arrival airport  flight type  price  \
8       British Airways               LGW             ORY  Direkt Uçuş   7572   
32      British Airways               LGW             ORY  Direkt Uçuş  11943   
142     British Airways               LGW             ORY  Direkt Uçuş   7572   
272     British Airways               LGW             ORY  Direkt Uçuş   5372   
273     British Airways               LGW             ORY  Direkt Uçuş   5372   
391     British Airways               LGW             ORY  Direkt Uçuş   4222   
402     British Airways               LGW             ORY  Direkt Uçuş   5372   
403     British Airways               LGW             ORY  Direkt Uçuş   5372   
519     British Airways               LGW             ORY  Direkt Uçuş   4539   
526     British Airways               LGW             ORY  Direkt Uçuş   5372   
641     British Airways               LGW             ORY  Direkt Uçuş   5372   
642     British Airways     