In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [43]:
data =pd.read_csv("cleaned_data.csv")
y= data["churn"]
X= data.drop(columns=["churn"])

In [44]:
sc=StandardScaler()
X_sc = sc.fit_transform(X)

In [45]:
X_scaled_df = pd.DataFrame(X_sc, columns=X.columns)

# Obliczenie macierzy korelacji z etykietą
corr_matrix = X_scaled_df.corrwith(y)

# Wybór istotnych cech na podstawie korelacji
threshold = 0.2  # Prog korelacji, który uznajemy za istotny

significant_features = corr_matrix[abs(corr_matrix) > threshold].index

X_sel = X_scaled_df[significant_features]

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [46]:
knn = KNeighborsClassifier(n_neighbors=5)

In [57]:
k_fold = 5
y_pred = cross_val_predict(knn, X_sc, y, cv=k_fold)

# Compute evaluation metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.7012782885677326
Precision: 0.6783834658676193
Recall: 0.8818476903870163
F1-score: 0.7668490533263852


In [51]:
y_pred = cross_val_predict(knn, X_sel, y, cv=k_fold)

# Compute evaluation metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9176136758794319
Precision: 0.9323266360941498
Recall: 0.9188014981273408
F1-score: 0.9255146568744577


In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)


In [58]:
y_pred_rf = cross_val_predict(rfc, X_sc, y, cv=k_fold)
accuracy_rf = accuracy_score(y, y_pred_rf)
precision_rf = precision_score(y, y_pred_rf)
recall_rf = recall_score(y, y_pred_rf)
f1_rf = f1_score(y, y_pred_rf)

print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)

Accuracy: 0.5891115964002058
Precision: 0.5824572414875255
Recall: 0.9268414481897628
F1-score: 0.7153594141453072


In [55]:
y_pred_rf = cross_val_predict(rfc, X_sel, y, cv=k_fold)
accuracy_rf = accuracy_score(y, y_pred_rf)
precision_rf = precision_score(y, y_pred_rf)
recall_rf = recall_score(y, y_pred_rf)
f1_rf = f1_score(y, y_pred_rf)

print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)

Accuracy: 0.9110900922203831
Precision: 0.9270235980715554
Recall: 0.9122097378277153
F1-score: 0.9195570098162598


In [59]:
from sklearn.ensemble import GradientBoostingClassifier


In [60]:
gbc = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)


In [62]:
y_pred_gbc = cross_val_predict(rfc, X_sc, y, cv=k_fold)
accuracy_gbc = accuracy_score(y, y_pred_gbc)
precision_gbc = precision_score(y, y_pred_gbc)
recall_gbc = recall_score(y, y_pred_gbc)
f1_gbc = f1_score(y, y_pred_gbc)

In [63]:
print("Accuracy:", accuracy_gbc)
print("Precision:", precision_gbc)
print("Recall:", recall_gbc)
print("F1-score:", f1_gbc)

Accuracy: 0.5891115964002058
Precision: 0.5824572414875255
Recall: 0.9268414481897628
F1-score: 0.7153594141453072


In [64]:
y_pred_gbc = cross_val_predict(rfc, X_sel, y, cv=k_fold)
accuracy_gbc = accuracy_score(y, y_pred_gbc)
precision_gbc = precision_score(y, y_pred_gbc)
recall_gbc = recall_score(y, y_pred_gbc)
f1_gbc = f1_score(y, y_pred_gbc)

In [65]:
print("Accuracy:", accuracy_gbc)
print("Precision:", precision_gbc)
print("Recall:", recall_gbc)
print("F1-score:", f1_gbc)

Accuracy: 0.9110900922203831
Precision: 0.9270235980715554
Recall: 0.9122097378277153
F1-score: 0.9195570098162598


In [66]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Tworzenie modelu regresji liniowej
model = LinearRegression()

# Definiowanie liczby podziałów w k-krotnej walidacji krzyżowej
k = 5

# Inicjalizacja listy do przechowywania wyników
mse_scores = []

# K-krotna walidacja krzyżowa
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X_sel):
    X_train, X_test = X_sel.iloc[train_index], X_sel.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Trenowanie modelu
    model.fit(X_train, y_train)
    
    # Predykcja na danych testowych
    y_pred = model.predict(X_test)
    
    # Obliczenie błędu średniokwadratowego
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

# Wyświetlenie wyników
print("Wyniki k-krotnej walidacji krzyżowej:")
for i, mse in enumerate(mse_scores):
    print(f"Fold {i+1}: MSE = {mse}")
print("Średni MSE:", np.mean(mse_scores))


Wyniki k-krotnej walidacji krzyżowej:
Fold 1: MSE = 0.0839982651148533
Fold 2: MSE = 0.08766010259203015
Fold 3: MSE = 0.11736693939597552
Fold 4: MSE = 0.17771078264093018
Fold 5: MSE = 0.13420504139583553
Średni MSE: 0.12018822622792494


In [None]:
Testowanie ANN

In [67]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential # used for init our ANN model
from tensorflow.keras.layers import Dense # used for different layer structure

In [90]:
classifier=Sequential()

In [91]:
classifier.add(Dense(10,activation='relu',input_dim=5))
classifier.add(Dense(10,activation='relu'))
classifier.add(Dense(1,activation='sigmoid'))

In [92]:
classifier.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 10)                60        
                                                                 
 dense_19 (Dense)            (None, 10)                110       
                                                                 
 dense_20 (Dense)            (None, 1)                 11        
                                                                 
Total params: 181
Trainable params: 181
Non-trainable params: 0
_________________________________________________________________


In [93]:
classifier.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])


In [94]:
k = 5

# Inicjalizacja list do przechowywania wyników
accuracy_scores = []
loss_scores = []

# K-krotna walidacja krzyżowa
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X_sel):
    X_train, X_test = X_sel.iloc[train_index], X_sel.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Trenowanie modelu
    history = classifier.fit(X_train, y_train, batch_size=50, epochs=100, verbose=0)

    # Ocena modelu na danych testowych
    loss, accuracy = classifier.evaluate(X_test, y_test, verbose=0)

    # Zapisanie wyników
    accuracy_scores.append(accuracy)
    loss_scores.append(loss)

# Wyświetlenie wyników
print("Wyniki k-krotnej walidacji krzyżowej:")
for i in range(k):
    print(f"Fold {i+1}: Loss = {loss_scores[i]}, Accuracy = {accuracy_scores[i]}")
print("Średnia Accuracy:", np.mean(accuracy_scores))
print("Średni Loss:", np.mean(loss_scores))

Wyniki k-krotnej walidacji krzyżowej:
Fold 1: Loss = 0.18271540105342865, Accuracy = 0.9365741610527039
Fold 2: Loss = 0.14385268092155457, Accuracy = 0.9630711674690247
Fold 3: Loss = 0.18476317822933197, Accuracy = 0.9464496970176697
Fold 4: Loss = 0.2956911027431488, Accuracy = 0.9033941030502319
Fold 5: Loss = 0.3450246751308441, Accuracy = 0.887049674987793
Średnia Accuracy: 0.9273077607154846
Średni Loss: 0.23040940761566162
