In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import os


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.linear_model import Ridge

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

In [32]:
"""
Chargement des données:
"""
# Read the three CSV files (sample submission, training set, test set),
# in that order, into their respective DataFrames.
sample_df, train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('sample_submission.csv', 'train.csv', 'test.csv')
)

# Display the first rows of the training set as the cell output.
train_df.head()

Unnamed: 0,id,brand,model,car_class,range,fuel_type,hybrid,max_power,grbx_type_ratios,weight_min,weight_max,urb_cons,exturb_cons,overall_cons,co,hc,nox,hcnox,ptcl,co2
0,0,MERCEDES,COMBI 110 CDI,MINIBUS,MOY-INFER,GO,non,70.0,M 6,1976,2075,9.1,6.4,7.4,0.083,,0.229,0.25,0.001,195
1,1,MERCEDES,VIANO 2.0 CDI,MINIBUS,MOY-SUPER,GO,non,100.0,A 5,2186,2355,10.2,7.0,8.2,0.078,,0.224,0.233,0.001,216
2,2,MERCEDES,SPRINTER COMBI 319 CDI,MINIBUS,MOY-INFER,GO,non,140.0,A 5,2586,2869,12.5,9.0,10.3,0.067,0.014,1.846,,0.002,272
3,3,RENAULT,MEGANE Coupé EnergyTCe (115ch) eco2,COUPE,MOY-INFER,ES,non,85.0,M 6,1280,1280,6.4,4.6,5.3,0.167,0.039,0.039,,0.001,119
4,4,MERCEDES,COMBI 116 CDI,MINIBUS,MOY-INFER,GO,non,120.0,A 5,2356,2450,10.1,6.9,8.1,0.042,,0.19,0.201,0.001,214


In [33]:
"""
Gestion des nan:
"""

# --- Missing values ---------------------------------------------------------
# Impute every numeric feature column that has a NaN in EITHER frame: looking
# at test_df alone would leave gaps in columns that are incomplete only in the
# training set. Non-numeric columns are skipped because they have no mean.
colonnes_communes = [col for col in train_df.columns if col in test_df.columns]
colonnes_avec_nan = [
    col for col in colonnes_communes
    if (train_df[col].isna().any() or test_df[col].isna().any())
    and pd.api.types.is_numeric_dtype(train_df[col])
]
if colonnes_avec_nan:
    # Both frames are filled with the TRAINING means so that train and test
    # go through exactly the same transformation.
    moyennes_train = train_df[colonnes_avec_nan].mean()
    train_df[colonnes_avec_nan] = train_df[colonnes_avec_nan].fillna(moyennes_train)
    test_df[colonnes_avec_nan] = test_df[colonnes_avec_nan].fillna(moyennes_train)


# --- Text columns (LabelEncoder) --------------------------------------------
colonnes_string = ['brand', 'model', 'car_class', 'fuel_type', 'hybrid', 'grbx_type_ratios', 'range']

# Replace each text column by integer codes. Each encoder is fitted on the
# train and test values together, so every value found in the test set has a
# code. Note that `le` only holds the encoder of the last column after the loop.
for colonne in colonnes_string:
    le = LabelEncoder()
    le.fit(pd.concat([train_df[colonne], test_df[colonne]]))
    train_df[colonne] = le.transform(train_df[colonne])
    test_df[colonne] = le.transform(test_df[colonne])


# --- Standardisation --------------------------------------------------------
# The scaler is fitted on the training set only and then applied to both sets.
# NOTE(review): 'hc' and 'hcnox' are not in this list, so they stay unscaled —
# confirm this is intended.
numeric_colons = ['max_power', 'weight_min', 'weight_max', 'urb_cons', 'exturb_cons', 'overall_cons', 'co', 'nox', 'ptcl']
scaler = StandardScaler()
train_df[numeric_colons] = scaler.fit_transform(train_df[numeric_colons])
test_df[numeric_colons] = scaler.transform(test_df[numeric_colons])


# Summary of both frames after preprocessing (dtypes and non-null counts).
train_df.info()
print("\n\n")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41257 entries, 0 to 41256
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                41257 non-null  int64  
 1   brand             41257 non-null  int64  
 2   model             41257 non-null  int64  
 3   car_class         41257 non-null  int64  
 4   range             41257 non-null  int64  
 5   fuel_type         41257 non-null  int64  
 6   hybrid            41257 non-null  int64  
 7   max_power         41257 non-null  float64
 8   grbx_type_ratios  41257 non-null  int64  
 9   weight_min        41257 non-null  float64
 10  weight_max        41257 non-null  float64
 11  urb_cons          41257 non-null  float64
 12  exturb_cons       41257 non-null  float64
 13  overall_cons      41257 non-null  float64
 14  co                41257 non-null  float64
 15  hc                41257 non-null  float64
 16  nox               41257 non-null  float6

In [34]:
""" 
Jeu de données:
"""
# Training features: every column of train_df except the target ('co2') and
# the row identifier ('id'). The target is kept separately in Y_train.
colonnes_exclues = ('co2', 'id')
colonnes_features = [nom for nom in train_df.columns if nom not in colonnes_exclues]
X_train = train_df[colonnes_features]
Y_train = train_df['co2']

# Test frame: all columns of test_df, 'id' included.
X_test = test_df[list(test_df.columns)]

In [35]:


"""
Régressions:
"""

# Random forest regressor with 100 trees and a fixed seed.
# Drop-in alternatives, fitted the same way: LinearRegression() or
# Ridge(alpha=1.0); for those, model.coef_ and model.intercept_ can be
# printed afterwards.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

# Results: score() evaluated on the training data itself, so this is not a
# held-out estimate.
score_entrainement = model.score(X_train, Y_train)
print("Score training:", score_entrainement)

Score training: 0.9998635597763816


In [36]:
"""
Prédiction:
"""
# Number of rows in the training set.
print(len(train_df))

# Predict from the feature columns only: 'id' is an identifier, and a 'result'
# column left over from a previous run of this cell must not reach the model.
features_test = X_test.drop(columns=['id']).drop(columns=['result'], errors='ignore')
X_test['result'] = model.predict(features_test)

print("Prédiction:")
# One (id, co2) pair per test row, built by pairing the two columns
# element-wise rather than repeating the whole Series once per row.
submission = X_test[['id', 'result']].rename(columns={'result': 'co2'})
resultat = list(zip(submission['id'], submission['co2']))

# Write 'resultat.csv' with the header "id,co2" followed by one line per row;
# index=False keeps the DataFrame index out of the file.
submission.to_csv('resultat.csv', index=False)


41257
Prédiction:
