# Курсовой проект модуля "Библиотеки Python для Data Science"

Подключение библиотек и скриптов

### 1. Подключение библиотек и скриптов

In [155]:
import numpy as np
import pandas as pd
import random

In [156]:
import pickle

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [157]:
import warnings
warnings.filterwarnings('ignore')

In [158]:
matplotlib.rcParams.update({'font.size': 14})

In [159]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    print("Train R2:\t" + str(round(r2(train_true_values, train_pred_values), 3)))
    print("Test R2:\t" + str(round(r2(test_true_values, test_pred_values), 3)))
    
    plt.figure(figsize=(18,10))
    
    plt.subplot(121)
    sns.scatterplot(x=train_pred_values, y=train_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Train sample prediction')
    
    plt.subplot(122)
    sns.scatterplot(x=test_pred_values, y=test_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Test sample prediction')

    plt.show()

### 2. Описание датасета

Этот набор данных содержит эпидемиологические данные о раке на уровне округов из исследования американского сообщества переписи населения США, предоставляет информацию о коэффициентах смертности с поправкой на возраст, среднем количестве смертей в год, недавних показателях тенденций, а также о том, соответствует ли каждый округ цели 45,5 смертей с поправкой на возраст. показатель на 100 000 человек

- Округ(**County**): название округа. (String)
- FIPS: Код федерального стандарта обработки информации для округа. (Integer)
- Достигнута ли цель 45,5 (1) (**Met Objective of 45.5? (1)**): Двоичное значение, указывающее, соответствует ли округ цели 45,5 коэффициента смертности с поправкой на возраст на 100 000 человек. (Boolean)
- Коэффициент смертности с поправкой на возраст (**Age-Adjusted Death Rate**): уровень смертности с поправкой на возраст на 100 000 человек. (Float)
- Среднее количество смертей в год (**Average Deaths per Year**): среднее количество смертей в год в округе. (Integer)
- Недавняя тенденция (2) (**Recent Trend (2)**): недавняя тенденция уровня смертности с поправкой на возраст на 100 000 человек. (Float)
- Недавняя 5-летняя тенденция (2) показателей смертности (**Recent 5-Year Trend (2) in Death Rates**): недавняя 5-летняя тенденция уровня смертности с поправкой на возраст на 100 000 человек. (Float)

### 3. Загрузка датасета

In [160]:
pd.read_csv('death .csv', delimiter=',')

Unnamed: 0.1,Unnamed: 0,index,County,FIPS,Met Objective of 45.5? (1),Age-Adjusted Death Rate,Lower 95% Confidence Interval for Death Rate,Upper 95% Confidence Interval for Death Rate,Average Deaths per Year,Recent Trend (2),Recent 5-Year Trend (2) in Death Rates,Lower 95% Confidence Interval for Trend,Upper 95% Confidence Interval for Trend
0,0,0,United States,0,No,46,45.9,46.1,157376,falling,-2.4,-2.6,-2.2
1,1,1,"Perry County, Kentucky",21193,No,125.6,108.9,144.2,43,stable,-0.6,-2.7,1.6
2,2,2,"Powell County, Kentucky",21197,No,125.3,100.2,155.1,18,stable,1.7,0,3.4
3,3,3,"North Slope Borough, Alaska",2185,No,124.9,73,194.7,5,**,**,**,**
4,4,4,"Owsley County, Kentucky",21189,No,118.5,83.1,165.5,8,stable,2.2,-0.4,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3136,3136,3136,"Yakutat City and Borough, Alaska<sup>3</sup>",2282,*,*,*,*,*,**,**,**,**
3137,3137,3137,"Yukon-Koyukuk Census Area, Alaska",2290,*,*,*,*,*,**,**,**,**
3138,3138,3138,"Zapata County, Texas",48505,*,*,*,*,*,*,*,*,*
3139,3139,3139,"Zavala County, Texas",48507,*,*,*,*,*,**,**,**,**


### 4. Загрузка функции

In [161]:
def optimizing_df(df):
    for col in df.columns:
        if df[col].dtypes.kind == 'i' or df[col].dtypes.kind == 'u':
            if df[col].min() >= 0:
                df[col] = pd.to_numeric(df[col], downcast='unsigned')
            else:
                df[col] = pd.to_numeric(df[col], downcast='integer')

        elif df[col].dtypes.kind == 'f' or df[col].dtypes.kind == 'c':
            df[col] = pd.to_numeric(df[col], downcast='float')

        elif df[col].dtypes.kind == 'O':
            num_unique_values = len(df[col].unique())
            num_total_values = len(df[col])
            if num_unique_values / num_total_values < 0.5:
                df[col] = df[col].astype('category')

    return df

### 5. Исследование датасета

In [162]:
df = pd.read_csv('death .csv', delimiter=',')
df.head()

Unnamed: 0.1,Unnamed: 0,index,County,FIPS,Met Objective of 45.5? (1),Age-Adjusted Death Rate,Lower 95% Confidence Interval for Death Rate,Upper 95% Confidence Interval for Death Rate,Average Deaths per Year,Recent Trend (2),Recent 5-Year Trend (2) in Death Rates,Lower 95% Confidence Interval for Trend,Upper 95% Confidence Interval for Trend
0,0,0,United States,0,No,46.0,45.9,46.1,157376,falling,-2.4,-2.6,-2.2
1,1,1,"Perry County, Kentucky",21193,No,125.6,108.9,144.2,43,stable,-0.6,-2.7,1.6
2,2,2,"Powell County, Kentucky",21197,No,125.3,100.2,155.1,18,stable,1.7,0,3.4
3,3,3,"North Slope Borough, Alaska",2185,No,124.9,73.0,194.7,5,**,**,**,**
4,4,4,"Owsley County, Kentucky",21189,No,118.5,83.1,165.5,8,stable,2.2,-0.4,4.8


In [163]:
df.tail()

Unnamed: 0.1,Unnamed: 0,index,County,FIPS,Met Objective of 45.5? (1),Age-Adjusted Death Rate,Lower 95% Confidence Interval for Death Rate,Upper 95% Confidence Interval for Death Rate,Average Deaths per Year,Recent Trend (2),Recent 5-Year Trend (2) in Death Rates,Lower 95% Confidence Interval for Trend,Upper 95% Confidence Interval for Trend
3136,3136,3136,"Yakutat City and Borough, Alaska<sup>3</sup>",2282,*,*,*,*,*,**,**,**,**
3137,3137,3137,"Yukon-Koyukuk Census Area, Alaska",2290,*,*,*,*,*,**,**,**,**
3138,3138,3138,"Zapata County, Texas",48505,*,*,*,*,*,*,*,*,*
3139,3139,3139,"Zavala County, Texas",48507,*,*,*,*,*,**,**,**,**
3140,3140,3140,"Ziebach County, South Dakota",46137,*,*,*,*,*,**,**,**,**


In [164]:
df.sample()

Unnamed: 0.1,Unnamed: 0,index,County,FIPS,Met Objective of 45.5? (1),Age-Adjusted Death Rate,Lower 95% Confidence Interval for Death Rate,Upper 95% Confidence Interval for Death Rate,Average Deaths per Year,Recent Trend (2),Recent 5-Year Trend (2) in Death Rates,Lower 95% Confidence Interval for Trend,Upper 95% Confidence Interval for Trend
1230,1230,1230,"Berrien County, Georgia",13019,No,54.4,41.4,70.3,12,falling,-2,-3.8,-0.2


***(!) Вывод: исследуя функции df.head() и df.tail(), обнаруживаем отсутсвующие данные***

***Оцениваем количество строк, столбцов:***

In [165]:
df.shape

(3141, 13)

***Выводим названия столбцов:***

In [166]:
df.columns

Index(['Unnamed: 0', 'index', 'County', 'FIPS', 'Met Objective of 45.5? (1)',
       'Age-Adjusted Death Rate',
       'Lower 95% Confidence Interval for Death Rate',
       'Upper 95% Confidence Interval for Death Rate',
       'Average Deaths per Year', 'Recent Trend (2)',
       'Recent 5-Year Trend (2) in Death Rates',
       'Lower 95% Confidence Interval for Trend',
       'Upper 95% Confidence Interval for Trend'],
      dtype='object')

***Возвращает информацию об индексе DataFrame:***

In [167]:
df.index

RangeIndex(start=0, stop=3141, step=1)

**Получение одного\нескольких признаков датасета:**

In [168]:
df[['Average Deaths per Year', 'Met Objective of 45.5? (1)', 'Recent Trend (2)']]

Unnamed: 0,Average Deaths per Year,Met Objective of 45.5? (1),Recent Trend (2)
0,157376,No,falling
1,43,No,stable
2,18,No,stable
3,5,No,**
4,8,No,stable
...,...,...,...
3136,*,*,**
3137,*,*,**
3138,*,*,*
3139,*,*,**


***(!) Вывод: признак 'Average Deaths per Year' предварительно можно определить как целевую пременную*** 

**Взятие наблюдения по индексу**

In [169]:
tmp = df[['Average Deaths per Year', 'Met Objective of 45.5? (1)', 'Recent Trend (2)']]
tmp.head()

Unnamed: 0,Average Deaths per Year,Met Objective of 45.5? (1),Recent Trend (2)
0,157376,No,falling
1,43,No,stable
2,18,No,stable
3,5,No,**
4,8,No,stable


In [170]:
tmp.index

RangeIndex(start=0, stop=3141, step=1)

In [171]:
tmp.loc[0: 2812]

Unnamed: 0,Average Deaths per Year,Met Objective of 45.5? (1),Recent Trend (2)
0,157376,No,falling
1,43,No,stable
2,18,No,stable
3,5,No,**
4,8,No,stable
...,...,...,...
2808,5,Yes,falling
2809,4,Yes,**
2810,37,Yes,falling
2811,7,Yes,falling


In [172]:
tmp.iloc[0: 2816]

Unnamed: 0,Average Deaths per Year,Met Objective of 45.5? (1),Recent Trend (2)
0,157376,No,falling
1,43,No,stable
2,18,No,stable
3,5,No,**
4,8,No,stable
...,...,...,...
2811,7,Yes,falling
2812,7,Yes,stable
2813,*,*,**
2814,*,*,*


In [173]:
tmp.iloc[-330:-1]

Unnamed: 0,Average Deaths per Year,Met Objective of 45.5? (1),Recent Trend (2)
2811,7,Yes,falling
2812,7,Yes,stable
2813,*,*,**
2814,*,*,*
2815,*,*,**
...,...,...,...
3135,*,*,**
3136,*,*,**
3137,*,*,**
3138,*,*,*


***(!) Вывод: гипотеза - ниже индекса 2812 отсутсвуют численные данные*** 

### 6. Приведение типов данных

In [174]:
df.dtypes

Unnamed: 0                                       int64
index                                            int64
County                                          object
FIPS                                             int64
Met Objective of 45.5? (1)                      object
Age-Adjusted Death Rate                         object
Lower 95% Confidence Interval for Death Rate    object
Upper 95% Confidence Interval for Death Rate    object
Average Deaths per Year                         object
Recent Trend (2)                                object
Recent 5-Year Trend (2) in Death Rates          object
Lower 95% Confidence Interval for Trend         object
Upper 95% Confidence Interval for Trend         object
dtype: object

***(!) Вывод: переменные index и FIPS - int64, остальные - object. Т.к. переменные index и FIPS не несут значимых данных, их можно удалить***

In [175]:
df['index'] = df['index'].astype(str)
df['index'].dtype

dtype('O')

In [176]:
df['FIPS'] = df['FIPS'].astype(str)
df['FIPS'].dtype

dtype('O')

In [177]:
df.dtypes

Unnamed: 0                                       int64
index                                           object
County                                          object
FIPS                                            object
Met Objective of 45.5? (1)                      object
Age-Adjusted Death Rate                         object
Lower 95% Confidence Interval for Death Rate    object
Upper 95% Confidence Interval for Death Rate    object
Average Deaths per Year                         object
Recent Trend (2)                                object
Recent 5-Year Trend (2) in Death Rates          object
Lower 95% Confidence Interval for Trend         object
Upper 95% Confidence Interval for Trend         object
dtype: object

**Фильтрация данных с помощью булевых масок**

In [195]:
df['Average Deaths per Year'] = df['Average Deaths per Year'].replace(',', '.')

In [202]:
df['Average Deaths per Year'] = df['Average Deaths per Year'].replace('str', 'float')
print(df)

      Unnamed: 0 index                                        County   FIPS  \
0              0     0                                 United States      0   
1              1     1                        Perry County, Kentucky  21193   
2              2     2                       Powell County, Kentucky  21197   
3              3     3                   North Slope Borough, Alaska   2185   
4              4     4                       Owsley County, Kentucky  21189   
...          ...   ...                                           ...    ...   
3136        3136  3136  Yakutat City and Borough, Alaska<sup>3</sup>   2282   
3137        3137  3137             Yukon-Koyukuk Census Area, Alaska   2290   
3138        3138  3138                          Zapata County, Texas  48505   
3139        3139  3139                          Zavala County, Texas  48507   
3140        3140  3140                  Ziebach County, South Dakota  46137   

     Met Objective of 45.5? (1) Age-Adjusted Death 

In [201]:
df['Average Deaths per Year'] = df['Average Deaths per Year'].astype(float)

ValueError: could not convert string to float: '157,376'

In [199]:
df[df['Average Deaths per Year'] > 4].head()

TypeError: '>' not supported between instances of 'str' and 'int'

In [200]:
df['Average Deaths per Year'] > 1

TypeError: '>' not supported between instances of 'str' and 'int'