## Выбор категориальных столбцов

In [None]:
# Keep only the columns of `data` whose dtype is `object`.
categorical = data.select_dtypes(include='object')

## Корреляция Пирсона

Какой столбец имеет наибольшую по модулю корреляцию Пирсона с целевой переменной price_range? В ответ запишите коэффициент корреляции (со знаком), округленный до сотых.

In [None]:
# Build a small demo DataFrame with three numeric columns.
df = pd.DataFrame(
    {
        'points': [25, 12, 15, 14, 19, 23, 25, 29],
        'assists': [5, 7, 7, 9, 12, 9, 9, 4],
        'rebounds': [11, 8, 10, 6, 6, 5, 9, 12],
    }
)

# Print the correlation of every column (including 'assists' itself)
# with the 'assists' column, one value per line, in column order.
assists = df['assists']
for _, series in df.items():
    print(series.corr(assists))


Коэффициент корреляции между столбцами points и assists равен -0,359. Поскольку он отрицателен,
это говорит нам о том, что с ростом числа очков число передач в среднем уменьшается (слабая отрицательная линейная связь).

## Замена значений

In [None]:
# Mapping from misspelled target labels to their corrected spelling.
values_to_change = {
    'Chorn': 'Churn',
    'Not chorn': 'Not churn'
}

# Report how many rows carry one of the misspelled labels before fixing them.
print(int(train['target'].isin(list(values_to_change)).sum()))

# Assign the result back instead of calling `replace(..., inplace=True)` on
# `train['target']`: the in-place call on a selected column is a chained
# operation that emits a FutureWarning on pandas 2.x and leaves `train`
# unchanged once Copy-on-Write is enabled.
train['target'] = train['target'].replace(values_to_change)

## Столбцы с пропусками

In [None]:
# Number of missing values in every column of `train`.
nan_counts = train.isna().sum()

# Column label with the largest number of missing values.
column_with_most_nans = nan_counts.idxmax()

# Labels of the columns that contain at least one missing value.
nan_columns = train.columns[train.isna().any()]

# Among the columns that do have missing values, the label of the one with
# the fewest. `idxmin()` returns the label directly; the previous
# `train.columns[... .argmin()]` used a position inside `nan_columns` to index
# the full column list, which picks the wrong column whenever some columns
# have no missing values.
min_nan_column = nan_counts[nan_columns].idxmin()

## Удаление строк

In [None]:
# Index labels of the rows where `min_nan_column` is missing.
rows_with_nans = train.index[train[min_nan_column].isna().to_numpy()]

# Remove those rows from `train` in place and report how many were selected.
train.drop(index=rows_with_nans, inplace=True)
print(f'Количество удаленных строк: {len(rows_with_nans)}')

## Количество уникальных значений

In [None]:
# Number of distinct values per column, excluding the 'target' column.
# Despite its name, this Series covers every remaining column, not only
# those with few values.
columns_with_few_values = train.drop('target', axis=1).nunique()

# Count the columns that have fewer than 5 distinct values.
num_columns_with_few_values = columns_with_few_values.lt(5).sum()
print(f'Количество столбцов с менее чем 5 различными значениями: {num_columns_with_few_values}')

## Условия

In [None]:
# Rows where column '2' is above its mean AND column '13' is below its median.
above_mean_2 = train['2'] > train['2'].mean()
below_median_13 = train['13'] < train['13'].median()
filtered_train = train[above_mean_2 & below_median_13]

## Число значений каждого вида в столбце

In [None]:
# Frequency of each distinct value in the "points" column
# (left as the last expression so the notebook displays it).
df.loc[:, "points"].value_counts()

## Кодировщики

### OneHotEncoding

In [None]:
# Columns to expand into dummy indicator columns.
categorical_columns = ['cat_bio', 'education', 'meal', 'preparation course', 'type', 'group']

# Apply the same get_dummies call (first level of each column dropped) to
# `train` and `test`.
# NOTE(review): the two frames are encoded independently, so the resulting
# column sets presumably differ if a category appears in only one of them --
# confirm before feeding both to the same model.
new_encoded_train, new_encoded_test = (
    pd.get_dummies(frame, columns=categorical_columns, drop_first=True)
    for frame in (train, test)
)

In [None]:
# OneHotEncoding для столбца touch_screen
# One-hot encode the touch_screen column, dropping the first category.
# The dense array is obtained with `.toarray()` instead of the `sparse=False`
# keyword: that keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so passing it fails on current releases.
encoder = OneHotEncoder(drop='first')
touch_screen_encoded = encoder.fit_transform(X[['touch_screen']]).toarray()

# Reuse X's own index so that the join below aligns row by row even when X
# does not have a default 0..n-1 index (e.g. after rows were dropped);
# otherwise the join would introduce NaNs for unmatched labels.
touch_screen_encoded_df = pd.DataFrame(
    touch_screen_encoded,
    columns=encoder.get_feature_names_out(['touch_screen']),
    index=X.index,
)

# Replace the original column with its encoded counterpart.
X = X.drop(columns=['touch_screen']).join(touch_screen_encoded_df)

### LabelEncoding

In [None]:
from sklearn import preprocessing

# Fit a label encoder on column '7', then overwrite the column with the
# integer codes the encoder assigns.
label_encoder = preprocessing.LabelEncoder().fit(train['7'])
train['7'] = label_encoder.transform(train['7'])

# Show the distinct encoded values (last expression, displayed by the notebook).
train['7'].unique()

## Модели

### Импорт моделей

In [None]:
# Model, model-selection and metric imports used by the cells below.
# The class is imported by name; the stray call parentheses after
# `CatBoostClassifier` made this line a SyntaxError.
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

### DecisionTreeClassifier

In [None]:
# Decision tree using the entropy criterion, limited to depth 5.
model = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=5)

# NOTE(review): this scorer object is created but not passed to
# cross_val_score below, which uses the built-in "roc_auc" scoring string.
roc_auc_scorer = make_scorer(roc_auc_score)

# 3-fold cross-validated ROC AUC on the encoded features.
scores = cross_val_score(estimator=model, X=X_encoded, y=y, scoring="roc_auc", cv=3)

# NOTE(review): despite the name, no rounding is applied to the mean.
mean_roc_auc = scores.mean()
rounded_mean_roc_auc = mean_roc_auc

print("Средний roc-auc по фолдам:", rounded_mean_roc_auc)

### CatBoostClassifier

In [None]:
# Columns CatBoost must treat as categorical. The original call contained
# `cat_feature['14',]` after keyword arguments, which is a SyntaxError; the
# keyword argument `cat_features` is presumably what was intended.
cat_features = ['14']

model = CatBoostClassifier(random_state=42, verbose=0, cat_features=cat_features)

# Hyper-parameter grid explored with 3-fold cross-validation, scored by F1.
param_grid = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.3]
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1')
grid_search.fit(X, y)
best_params = grid_search.best_params_

# Refit a fresh model with the best parameters. `best_params` only holds the
# searched hyper-parameters, so the categorical columns are declared again
# to keep this model configured like the one that was tuned.
best_model = CatBoostClassifier(
    **best_params,
    random_state=42,
    verbose=0,
    cat_features=cat_features,
)
best_model.fit(X, y)
y_pred = best_model.predict(X_test)

f1 = f1_score(y_test, y_pred)

print(f'Наилучшие параметры: {best_params}')
print(f'F1-Score на тестовых данных: {round(f1, 2)}')

### LogisticRegression

In [None]:
# Logistic regression with a high iteration cap and a fixed seed.
model = LogisticRegression(max_iter=10000, random_state=42)

# Candidate values for the regularisation parameter C.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100])

# Score candidates with F1 over 3 cross-validation folds.
f1_scorer = make_scorer(f1_score)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=3,
)
grid_search.fit(X, y)

# Best C and the mean cross-validated F1 it achieved.
best_f1_score = grid_search.best_score_
best_C = grid_search.best_params_['C']

## GridSearchCV

In [None]:
# Candidate values for C searched in this cell.
param_grid = dict(C=[0.001, 0.01, 1, 10, 100])

# Grid search over `model` (defined in an earlier cell) with 3 folds,
# scored by `f1_scorer` (also defined earlier).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=3,
)
grid_search.fit(X, y)

best_C = grid_search.best_params_['C']
print(f'Наилучшее значение C: {best_C}')

## Кросс-валидация

In [None]:
# F1 scorer and a logistic regression with a fixed seed.
f1_scorer = make_scorer(f1_score)
model = LogisticRegression(random_state=42)

# Per-fold F1 over 3 cross-validation folds, then their mean.
scores = cross_val_score(estimator=model, X=X, y=y, scoring=f1_scorer, cv=3)
mean_f1_score = scores.mean()

## Вывод результатов в CSV

In [None]:
# Fit the selected model on X, y and predict for the `test` frame.
best_model.fit(X, y)
predictions = best_model.predict(test)

# Store the predictions as a 'target' column on `test`, then write only that
# column to result.csv without the index.
test['target'] = predictions
test.to_csv("result.csv", columns=['target'], index=False)