<a href="https://colab.research.google.com/github/Jek5231/Jek5231/blob/main/3_%D0%B7%D0%B0%D0%B4%D0%B0%D0%BD%D0%B8%D0%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the data.
data = pd.read_csv('churn_data.csv')

# Drop the customerID column: it is a row identifier, not a feature.
data = data.drop(columns=['customerID'])

# Convert TotalCharges to a numeric column. The string clean-up is only
# applied when the column was not already parsed as numeric, because the
# `.str` accessor raises AttributeError on a numeric Series.
total_charges = data['TotalCharges']
if not pd.api.types.is_numeric_dtype(total_charges):
    # Remove spaces so blank entries become empty strings; errors='coerce'
    # below then turns anything unparseable into NaN.
    total_charges = total_charges.astype(str).str.replace(' ', '', regex=False)
data['TotalCharges'] = pd.to_numeric(total_charges, errors='coerce')

# Report missing values AFTER the conversion: blank strings are not counted
# by isnull(), so checking earlier would show 0 missing TotalCharges values.
print(data.isnull().sum())

# Separate the features from the target variable.
X = data.drop(columns=['Churn'])
y = data['Churn']

# Split the columns by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns

# Numerical branch: fill missing values with the column mean, then scale.
# Imputing inside the pipeline means the mean is learned from the training
# split only, so no information from the test split leaks into training.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Preprocessing for all columns. handle_unknown='ignore' keeps predict() from
# failing when the test split contains a category not seen during fit.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Split the data into training and test sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Full model: preprocessing followed by a random forest classifier.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
# Fit the preprocessing steps and the classifier on the training split.
model.fit(X_train, y_train)

# Evaluate the model on the held-out test split.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

tenure              0
PhoneService        0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
Accuracy: 0.7814052519517388
Classification Report:
               precision    recall  f1-score   support

          No       0.83      0.88      0.86      1036
         Yes       0.60      0.51      0.55       373

    accuracy                           0.78      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.77      0.78      0.78      1409

