In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
import math

In [2]:
#Seleccionar todos los datos de 'train'
array_df = [pd.read_csv('Data/train/data_0'+ str(csvnum) +'.csv') for csvnum in np.arange(1,15+1)]

In [3]:
for df in array_df:
    #Se cambia el nombre de las columnas para que tengan un formato acorde al de Python
    df.columns = ["date" , "ambient_temperature" , "ambient_pressure" , "compressor_speed" , "compressor_discharge_pressure" , "gas_generator_discharge_pressure" , "high _pressure_turbine_inlet_temperature" , "compressor_discharge_temperature" , "low_pressure_turbine_inlet_temperature" , "exhaust_temperature" , "ambient_relative_humidity" , "water_to_dry_air_ratio" , "power"]

In [4]:
# longitud de cada dataframe
array_len_df = [len(df) for df in array_df]

In [5]:
#nuevo index en el dataframe de los datos 
for csvnum in np.arange(1,15+1):
    array_df[csvnum-1].set_axis(np.arange(sum(array_len_df[:csvnum-1]), sum(array_len_df[:csvnum])), axis='index', inplace=True)

In [6]:
#unir todos los datos
df_train = pd.concat(array_df) 

In [7]:
#quitar valores vacios en entrenamiento
df_train = df_train.dropna()

In [8]:
#poner en 0 los valores vacios en entrenamiento
df_train = df_train.fillna(0)

In [9]:
len_train = len(df_train)
#len_val = len(df_val)

In [10]:
#Clase para mi modelo, con un metodo para entrenamiento y otro de prediccion
#los hyperparametros son la línea de separación para los datos de entrenamiento
# (linear_cut_slope, linear_cut_intercept) y la k del k-Means (knum)

class MyModel():
    # default constructor
    def __init__(self, linear_cut_slope = 1640, linear_cut_intercept = 0, knum = 3):
        self.linear_cut_slope = linear_cut_slope
        self.linear_cut_intercept = linear_cut_intercept
        self.knum = knum
        
    # a method for printing data members
    def train(self, df_tr):
        self.df_tr = df_tr #dataframe for training
        
        # Linear Regression EXH_T vs POWER
        X_tr = np.array(df_tr['exhaust_temperature']).reshape(-1,1)
        Y_tr = np.array(df_tr['power']).reshape(-1,1)

        self.orange_model = LinearRegression().fit(X_tr, Y_tr)

        # KNeighbors data election with CDP and CMP_SPEED
        self.bluemask = df_tr['power'] < (self.linear_cut_slope)*df_tr['compressor_discharge_pressure'] + (self.linear_cut_intercept)
        df_X_kmeans = df_tr[['compressor_discharge_pressure', 'compressor_speed']].to_numpy()
        df_Y_kmeans = self.bluemask #np.array([0 for i in df_X_kmeans])
        
        self.chooseblue = KNeighborsClassifier(self.knum)
        self.chooseblue.fit(df_X_kmeans, df_Y_kmeans)
        
        # Linear Regression CDP vs POWER
        X_tr = np.array(df_tr['compressor_discharge_pressure']).reshape(-1,1)
        Y_tr = np.array(df_tr['power']).reshape(-1,1)
        
        self.blue_model = LinearRegression().fit(X_tr[self.bluemask], Y_tr[self.bluemask])

    def predict(self, df_pr): #dataframe for predicting
        
        # Predict with Linear Regression EXH_T vs POWER
        X_prt = np.array(df_pr['exhaust_temperature']).reshape(-1,1)
        Y_prt = self.orange_model.predict(X_prt)

        # Predict with Linear Regression CDP vs POWER ()
        X_prt1 = np.array(df_pr['compressor_discharge_pressure']).reshape(-1,1)
        Y_prt1 = self.blue_model.predict(X_prt1) 
        
        # Blue or orange prediction with KNeighbors data
        self.bluelabel = self.chooseblue.predict(df_pr[['compressor_discharge_pressure', 'compressor_speed']].to_numpy())

        # change prediction if data is blue type with bluelabel
        for i in np.arange(len(Y_prt)):
            if (self.bluelabel[i] == True):
                Y_prt[i] = Y_prt1[i]
                
        # Take into account that test_data_123.csv, 
        # df_pr['exhaust_temperature'] could be zero
        for i in np.arange(len(Y_prt)):
            if ((df_pr['exhaust_temperature']==0)[i]==True):
                Y_prt[i] = 0

        return Y_prt

# Predicciones en Kaggle

In [11]:
Modelo1 = MyModel()

In [12]:
Modelo1.train(df_train)

In [13]:
# Dataframe por predecir
df_pred = pd.read_csv ('Data/test/test_data_123.csv')

In [14]:
#df_pred

In [15]:
# Se cambia el nombre de las columnas para que tengan un formato acorde al de Python
df_pred.columns = ["date" , "ambient_temperature" , "ambient_pressure" , "compressor_speed" , "compressor_discharge_pressure" , "gas_generator_discharge_pressure" , "high _pressure_turbine_inlet_temperature" , "compressor_discharge_temperature" , "low_pressure_turbine_inlet_temperature" , "exhaust_temperature" , "ambient_relative_humidity" , "water_to_dry_air_ratio" ]

In [16]:
# Colocar valores nulos en cero
df_pred = df_pred.fillna(0)

In [17]:
# Prediccion de la potencia
y_val_pred = Modelo1.predict(df_pred)

In [18]:
# Agregar como nueva columna al dataframe original (test_data_123.csv)
df_pred["power_pred"] = y_val_pred

In [19]:
# Prediccion como numpy array
y_val_pred

array([[ 8412.65777622],
       [ 8420.03497684],
       [ 8082.8428351 ],
       [ 8185.18965047],
       [ 6980.06588073],
       [    0.        ],
       [17145.13312055],
       [17942.85945965],
       [17596.00204954],
       [17251.45301281],
       [17400.04139878],
       [17893.90109375],
       [18077.00972554],
       [17895.63618714],
       [17684.4103831 ],
       [17330.2061279 ],
       [17101.97207248],
       [17243.15583629],
       [16950.3077832 ],
       [16387.50964197],
       [15922.27297519],
       [15892.41270487],
       [15864.2489731 ],
       [15594.71005605],
       [15693.23718128],
       [15978.60787331],
       [15977.87517037],
       [15990.44170517],
       [15880.7743763 ],
       [15813.3896398 ],
       [15726.41237781],
       [15223.52066365],
       [15094.25284795],
       [15135.02885977],
       [14968.21395487],
       [15286.53684372],
       [14175.18772095],
       [    0.        ],
       [12087.48929039],
       [    0.        ],


In [20]:
# Seleccionar columnas importantes
df_pred_output = df_pred[['date','power_pred']]

In [21]:
# Cambiar de nombre
df_pred_output.columns = ["date" , "POWER"]

In [22]:
# df_pred_output

In [23]:
# Cambiar el tipo de la variable 'date' a tipo fecha
df_pred_output['date'] = pd.to_datetime(df_pred_output['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred_output['date'] = pd.to_datetime(df_pred_output['date'])


In [24]:
#df_pred_output['date']

In [25]:
# Modificar la columna de 'date' para que tenga el formato correcto de output
df_pred_output['date'] = df_pred_output['date'].dt.strftime('%-m/%-d/%Y')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred_output['date'] = df_pred_output['date'].dt.strftime('%-m/%-d/%Y')


In [26]:
#df_pred_output.to_csv(index=False)

In [27]:
#Guardar predicciones como submission.csv en esta carpeta
df_pred_output.to_csv('submission.csv', index = False)  

In [28]:
#df_pred_output