In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis

In [2]:
file_path = 'train.csv' #cambiar por la ruta en donde tengan el archivo entrenamiento
file_path2 = 'test.csv' 

df = pd.read_csv(file_path)
df2 = pd.read_csv(file_path2)

In [3]:
#Chequear si existen datos faltantes/en blanco
print(df.isnull().sum())
print(df2.isnull().sum())

V1         304
V2           0
V3           0
V4         304
V5         304
V6         304
V7           0
V8         304
V9         304
V10          0
V11        304
V12         37
V13          0
V14         31
V15      10327
Class        0
dtype: int64
V1       96
V2        0
V3        0
V4       96
V5       96
V6       96
V7        0
V8       96
V9       96
V10       0
V11      96
V12      13
V13       0
V14      19
V15    3753
dtype: int64


In [4]:
#Como se pudo ver en el output anterior hay una gran cantidad de valores en blanco/faltantes, por lo que se procederá de la siguiente forma:
#Se rellenarán los datos en blanco con la media del dataset

# Lista de columnas a rellenar
columnas_a_rellenar = ["V1", "V4", "V5", "V6", "V8", "V9", "V11", "V12", "V14"]

# Rellenar valores faltantes con la media en set de entrenamiento
for columna in columnas_a_rellenar:
    df[columna] = df[columna].fillna(df[columna].mean())


# Rellenar valores faltantes con la media en set de pruebas
for columna in columnas_a_rellenar:
    df2[columna] = df2[columna].fillna(df2[columna].mean())



In [5]:
# Verificar valores faltantes
print(df.isnull().sum())

# Verificar tipos de datos
print(df.dtypes)

# Verificar resumen estadístico para identificar outliers
print(df.describe())

V1           0
V2           0
V3           0
V4           0
V5           0
V6           0
V7           0
V8           0
V9           0
V10          0
V11          0
V12          0
V13          0
V14          0
V15      10327
Class        0
dtype: int64
V1       float64
V2       float64
V3       float64
V4       float64
V5       float64
V6       float64
V7       float64
V8       float64
V9       float64
V10      float64
V11      float64
V12      float64
V13      float64
V14      float64
V15      float64
Class      int64
dtype: object
                  V1            V2            V3             V4            V5  \
count   10980.000000  10980.000000  10980.000000   10980.000000  10980.000000   
mean     4329.956531   4009.902705   4263.892771    4182.036731   4341.713772   
std      2910.721372     50.417659     50.275117    6092.905791     39.079867   
min      1030.770000   2830.770000   1040.000000    2453.330000   2089.740000   
25%      4281.030000   3990.770000   4250.260000    4108

In [6]:
#Set de pruebas

# Verificar valores faltantes
print(df2.isnull().sum())

# Verificar tipos de datos
print(df2.dtypes)


# Verificar resumen estadístico para identificar outliers
print(df2.describe())

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15    3753
dtype: int64
V1     float64
V2     float64
V3     float64
V4     float64
V5     float64
V6     float64
V7     float64
V8     float64
V9     float64
V10    float64
V11    float64
V12    float64
V13    float64
V14    float64
V15    float64
dtype: object
                V1           V2           V3           V4           V5  \
count  4000.000000  4000.000000  4000.000000  4000.000000  4000.000000   
mean   4302.423427  4009.397087  4264.378352  4122.594339  4341.731821   
std      37.456163    30.444417    21.305554    21.055579    17.172061   
min    4198.460000  3908.210000  4199.490000  4058.970000  4309.740000   
25%    4281.540000  3990.770000  4250.260000  4108.210000  4331.790000   
50%    4295.900000  4005.640000  4262.560000  4120.510000  4339.490000   
75%    4312.820000  4023.080000  4271.790000  4132

In [7]:
#removeremos valores nulos
df = df.drop('V15', axis=1)
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,Class
0,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.10,0
1,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0
2,4328.72,4011.79,4296.41,4155.90,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0
3,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.90,4627.69,4210.77,4244.10,4212.82,4288.21,4632.82,4398.46,0
4,4321.03,4004.62,4284.10,4153.33,4345.64,4587.18,4093.33,4616.92,4202.56,4232.82,4209.74,4281.03,4628.21,4389.74,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10975,4288.21,3995.90,4248.21,4120.00,4334.36,4615.90,4084.62,4641.03,4214.36,4228.72,4178.46,4273.85,4600.00,4343.08,1
10976,4282.56,3991.79,4250.26,4115.90,4332.31,4612.82,4077.44,4639.49,4210.77,4225.64,4175.38,4267.69,4595.90,4340.00,1
10977,4280.51,3988.72,4249.23,4116.92,4332.31,4612.82,4072.31,4632.31,4207.69,4220.00,4173.85,4271.28,4595.38,4343.08,1
10978,4284.62,3991.79,4251.28,4122.05,4334.36,4616.41,4080.51,4628.72,4200.00,4220.00,4165.64,4267.18,4596.41,4350.77,1


In [8]:
df2 = df2.drop('V15', axis=1)
df2

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14
0,4284.10,4020.00,4249.23,4122.05,4335.38,4616.92,4070.77,4618.97,4187.69,4215.90,4190.26,4266.15,4597.44,4342.05
1,4289.74,3969.74,4249.23,4104.10,4336.41,4606.15,4066.15,4591.79,4169.23,4208.72,4188.21,4270.26,4581.54,4350.77
2,4297.44,4015.38,4250.26,4115.38,4326.15,4589.74,4078.97,4603.59,4194.87,4221.54,4186.15,4273.85,4599.49,4361.54
3,4360.51,4055.38,4291.28,4140.51,4340.51,4621.54,4082.05,4615.90,4204.62,4231.79,4222.05,4295.38,4613.33,4429.23
4,4318.46,4006.67,4278.97,4123.59,4342.05,4614.87,4069.74,4624.10,4200.00,4244.62,4203.08,4293.85,4608.21,4375.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,4294.87,3995.38,4259.49,4103.08,4337.95,4633.33,4077.95,4617.44,4213.85,4238.97,4203.08,4273.85,4594.36,4347.69
3996,4300.51,4014.36,4248.21,4126.67,4344.10,4621.03,4080.51,4600.00,4192.82,4251.28,4203.08,4277.95,4607.69,4372.31
3997,4293.85,3995.90,4251.79,4113.33,4330.26,4632.82,4070.26,4624.62,4203.08,4237.95,4215.90,4284.10,4620.51,4367.69
3998,4293.85,4020.00,4274.36,4135.38,4369.74,4650.26,4105.13,4663.59,4237.44,4263.08,4217.95,4285.64,4636.41,4364.10


In [9]:
x_test = df2
x_train = df.drop('Class', axis=1 ) #features
y_train = df['Class'] #target
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(x_train)

# Transformar los datos de prueba con la media y desviación estándar del entrenamiento
X_test_scaled = scaler.transform(x_test)


#naive bayes#######################################
nb = GaussianNB()
skf2 = StratifiedKFold(n_splits=5)
cv2 = cross_validate(nb, X_train_scaled, y_train, cv=skf2, scoring=['f1', 'precision', 'recall'], return_train_score=True)
print("Metricas cross_validation NB(F1-score):", cv2['test_f1'].mean())
print("Metricas cross_validation NB(Precision):", cv2['test_precision'].mean())
print("Metricas cross_validation NB(Recall):", cv2['test_recall'].mean())

nb.fit(X_train_scaled,y_train)

nb_predict = nb.predict(X_test_scaled)
print('##################################################################')
#QDA###############################################
qda = QuadraticDiscriminantAnalysis()
skf3 = StratifiedKFold(n_splits=5)
cv3 = cross_validate(qda, X_train_scaled, y_train, cv=skf3, scoring=['f1', 'precision', 'recall'], return_train_score=True)
print("Metricas cross_validation QDA(F1-score):", cv3['test_f1'].mean())
print("Metricas cross_validation QDA(Precision):", cv3['test_precision'].mean())
print("Metricas cross_validation QDA(Recall):", cv3['test_recall'].mean())

qda.fit(X_train_scaled,y_train)
qda_predict = qda.predict(X_test_scaled)
print('##################################################################')

#LDA##############################################
lda = LinearDiscriminantAnalysis()
skf4 = StratifiedKFold(n_splits=5)
cv4 = cross_validate(lda, X_train_scaled, y_train, cv=skf4, scoring=['f1', 'precision', 'recall'], return_train_score=True)
print("Metricas cross_validation LDA(F1-score):", cv4['test_f1'].mean())
print("Metricas cross_validation LDA(Precision):", cv4['test_precision'].mean())
print("Metricas cross_validation LDA(Recall):", cv4['test_recall'].mean())

lda.fit(X_train_scaled,y_train)
lda_predict = lda.predict(X_test_scaled)
print('##################################################################')


#Logistic regresion################################
lr = LogisticRegression(max_iter=100, class_weight='balanced', C=0.0001, solver='lbfgs')
skf = StratifiedKFold(n_splits=5)

cv = cross_validate(lr, X_train_scaled, y_train, cv=skf, scoring=['f1', 'precision', 'recall'], return_train_score=True)
print("Metricas cross_validation LR(F1-score):", cv['test_f1'].mean())
print("Metricas cross_validation LR(Precision):", cv['test_precision'].mean())
print("Metricas cross_validation LR(Recall):", cv['test_recall'].mean())

lr.fit(X_train_scaled, y_train)
y_val_pred_lr = lr.predict(X_test_scaled)





Metricas cross_validation NB(F1-score): 0.5770247259386616
Metricas cross_validation NB(Precision): 0.4218019964441799
Metricas cross_validation NB(Recall): 0.9153369652945924
##################################################################
Metricas cross_validation QDA(F1-score): 0.5510084141470717
Metricas cross_validation QDA(Precision): 0.41601740510988117
Metricas cross_validation QDA(Recall): 0.8379956954533225
##################################################################
Metricas cross_validation LDA(F1-score): 0.28800021578002977
Metricas cross_validation LDA(Precision): 0.3166925713136314
Metricas cross_validation LDA(Recall): 0.28628822527127606
##################################################################
Metricas cross_validation LR(F1-score): 0.411776817988443
Metricas cross_validation LR(Precision): 0.3875324268591577
Metricas cross_validation LR(Recall): 0.4509777150031387


In [10]:
# generacion csv

id = pd.Series(range(4000))
df_result = pd.DataFrame({'ID': id, 'Class': y_val_pred_lr})

# Mostrar las primeras filas del nuevo dataset
df_result.to_csv('predicciones_LR.csv', index=False)

df_result2 = pd.DataFrame({'ID': id, 'Class': nb_predict})

# Mostrar las primeras filas del nuevo dataset
df_result2.to_csv('predicciones_NB.csv', index=False)


df_result3 = pd.DataFrame({'ID': id, 'Class': qda_predict})

# Mostrar las primeras filas del nuevo dataset
df_result3.to_csv('predicciones_QDA.csv', index=False)

df_result4 = pd.DataFrame({'ID': id, 'Class': lda_predict})

# Mostrar las primeras filas del nuevo dataset
df_result4.to_csv('predicciones_LDA.csv', index=False)
