In [69]:
#################################################################################
###           0O2_02_Stats_Model_DimReduced
###
##################################################################################

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix

# ---------------------------------------------------------------------------
# Load the cleaned vehicle data and the table of selected variables, remove
# the energies that are not modelled, and define the run configuration that
# the modelling cells below read.
# ---------------------------------------------------------------------------

# Input files
# NOTE(review): Cleanfile is an absolute path on one user's machine; consider a
# configurable data directory so the notebook runs elsewhere.
Cleanfile = r"/Users/livalacaisse/Documents/DataScience/CO2/000-C02 First Delivery/Cleaned_countries/FR_Cleaned.csv"
SelectedVars = "ReductionsByEnergy.csv"

# Explicit dtypes keep memory down (float32 / category) while reading the CSV
dtype_spec = {
    'Em_on_target': 'int64',
    'Fuel consumption': 'float32',
    'Engine_cm3': 'float32',
    'Kg_veh': 'float32',
    'Test_mass': 'float32',
    'Power_KW': 'float32',
    'El_Consumpt_whkm': 'float32',
    'Energy': 'category',
    'Fuel_mode': 'category',
    'Brand': 'category',
    'Veh_type': 'category',
    'Veh_Model': 'category',
    'Version': 'category',
    'Veh_Category': 'category',
    'year': 'int64',
    'Country': 'category'
}

df_Clean = pd.read_csv(Cleanfile, dtype=dtype_spec, low_memory=False)
df_SelectedVars = pd.read_csv(SelectedVars)

# Drop the "Unnamed: 0" column BEFORE de-duplicating. It looks like the row
# index saved with the CSV (TODO confirm); while it is present every row is
# distinct, so drop_duplicates() would not remove anything.
df_Clean = df_Clean.drop("Unnamed: 0", axis=1)
df_Clean = df_Clean.drop_duplicates()

# Inspect the columns to ensure 'Energy' exists
print("Columns in df_Clean:", df_Clean.columns)

# Electric and hydrogen vehicles are excluded from the modelling data
if 'Energy' in df_Clean.columns:
    is_excluded_energy = (df_Clean["Energy"] == "electric") | (df_Clean["Energy"] == "hydrogen")
    df_Clean = df_Clean[~is_excluded_energy]
else:
    print("The 'Energy' column does not exist in the DataFrame")

# Run configuration
Target_Var = "CO2_wltp"          # column the models predict
NumVar = 12
Alpha = 0.5                      # Ridge regularisation strength
Splits = 5
V_Country = "ALL_Countries"
model = Ridge(alpha=Alpha)
Redct_Type = "PCA"               # Reduction_type value looked up in df_SelectedVars



In [79]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df_Clean.info()

None

In [70]:
# Preview the first five rows of the filtered frame
df_Clean.head(n=5)

Unnamed: 0,Country,Constructor,Veh_type,Version,Brand,Veh_Model,Veh_Category,Kg_veh,Test_mass,CO2_wltp,...,Fuel_mode,Engine_cm3,Power_KW,El_Consumpt_whkm,Erwltp (g/km),year,Fuel consumption,Electric range (km),Eco-innovation program,Em_on_target
0,FR,FCA ITALY SPA,BU,09A,JEEP,RENEGADE,M1,1465.0,1620.549438,147.828783,...,M,1598.0,88.0,0.0,0.0,2018,3.973137,0.0,0,0
1,FR,FCA ITALY SPA,BU,02A,JEEP,RENEGADE,M1,1395.0,1620.549438,159.142421,...,M,1368.0,103.0,0.0,0.0,2018,3.973137,0.0,0,0
2,FR,FCA ITALY SPA,BU,02A,JEEP,RENEGADE,M1,1395.0,1620.549438,159.142421,...,M,1368.0,103.0,0.0,0.0,2018,3.973137,0.0,0,0
3,FR,FCA ITALY SPA,BU,09A,JEEP,RENEGADE,M1,1465.0,1620.549438,147.828783,...,M,1598.0,88.0,0.0,0.0,2018,3.973137,0.0,0,0
4,FR,FCA ITALY SPA,BU,09A,JEEP,RENEGADE,M1,1465.0,1620.549438,147.828783,...,M,1598.0,88.0,0.0,0.0,2018,3.973137,0.0,0,0


In [81]:
#-- Fit one Ridge model per energy type and write the scores to a results file

import sys

#-- From here on print() writes to the results file. When this cell succeeds the
#-- stream is deliberately left redirected: the following cell prints its summary
#-- into the same file and closes it.
#-- NOTE(review): the file name says "Scalated" but the features are passed to
#-- Ridge unscaled (a StandardScaler variant was commented out here) - confirm
#-- which of the two is intended.
notebook_stdout = sys.stdout
sys.stdout = open('Ridge_results_Scalated'+Target_Var+'.txt', 'w')

#-- Fixed seed so the train/test split (and therefore R2 / RMSE) is repeatable
SPLIT_SEED = 42

result_columns = ["Reduction_type", "TargetVar", "Energy", "R2", "RMSE"]
score_rows = []

#-- Predictions of the most recently fitted energy; stays empty if none is fitted
results = pd.DataFrame(columns=['Predict', 'Real values'])

#-- Keep only the rows of the chosen reduction type BEFORE listing the energies,
#-- so energies without variables for this reduction type are never visited.
df_SelectedVars = df_SelectedVars[df_SelectedVars['Reduction_type'] == Redct_Type]
Energies = df_SelectedVars["Energy"].unique()

try:
    for Ene in Energies:

        #-- Select lines related to the corresponding energy.
        df_energy = df_Clean[df_Clean['Energy'] == Ene]

        #-- Select vars needed for training model
        Top_features = df_SelectedVars.loc[df_SelectedVars['Energy'] == Ene, "Selected_vars"].unique()

        #-- An energy listed in the variable table may have no rows in df_Clean
        #-- (electric / hydrogen are removed earlier); splitting an empty frame
        #-- would raise, so such energies are reported and skipped.
        if len(df_energy) == 0 or len(Top_features) == 0:
            print(" Skipping " + str(Ene) + ": no rows or no selected variables")
            continue

        y_target = df_energy[Target_Var]
        X_features = df_energy[Top_features].copy()
        X_train, X_test, y_train, y_test = train_test_split(
            X_features, y_target, test_size=0.2, random_state=SPLIT_SEED
        )

        #-- Train and predict results
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        results = pd.DataFrame({'Predict': predictions, 'Real values': y_test})

        print(" ****************************************** ")
        print(" Ridge Regression "+  Ene )

        r2 = r2_score(y_test, predictions)
        print(" R² score:", r2)

        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        print(" RMSE:", rmse)
        print(" ****************************************** ")

        #-- Exactly one score row per energy
        score_rows.append({'Reduction_type': Redct_Type, 'TargetVar': Target_Var, 'Energy': Ene, 'R2': r2, 'RMSE': rmse})
except BaseException:
    #-- On any failure give print() back to the notebook, otherwise every later
    #-- cell would keep writing silently into the results file.
    sys.stdout.close()
    sys.stdout = notebook_stdout
    raise

#-- Build the score table once instead of growing it with concat inside the loop
result_df = pd.DataFrame(score_rows, columns=result_columns)


In [None]:
#-- Append the summary tables to the results file, then release the file

import io
import sys

print(result_df.head(25))
print(results.head(25))

#-- The modelling cell points sys.stdout at the results file. Close that file and
#-- put a usable stream back, so later output is not written to a closed handle.
#-- Only a real file object is closed; the interpreter's or kernel's own stream is
#-- never touched.
#-- NOTE(review): under Jupyter the kernel's stream is not sys.__stdout__; if more
#-- print() cells are added after this one, restore the kernel's stream instead.
log_stream = sys.stdout
if isinstance(log_stream, io.TextIOWrapper) and log_stream is not sys.__stdout__:
    log_stream.close()
    sys.stdout = sys.__stdout__

#-- Optional "finished" sound. winsound exists only on Windows, so the import is
#-- guarded and the notebook still completes on other platforms.
try:
    import winsound
except ImportError:
    winsound = None

if winsound is not None:
    try:
        winsound.PlaySound("C:\\Users\\mied1\\00 - Herramientas Python\\mixkit-intro-transition-1146.wav", winsound.SND_FILENAME)
    except RuntimeError:
        #-- The sound is a convenience only; a playback failure must not fail the run
        pass

display ("\n End")

# 