In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Загружаем данные


In [2]:
df = pd.read_csv('seattle-weather.csv')

### Основная информация о данных

In [3]:
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [5]:
df.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


### Конвертация колонки с датой. Выделение признака

In [6]:
df['date'] = pd.to_datetime(df['date'])

In [7]:
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter

In [8]:
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,month,quarter
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,1,1
1,2012-01-02,10.9,10.6,2.8,4.5,rain,1,1
2,2012-01-03,0.8,11.7,7.2,2.3,rain,1,1
3,2012-01-04,20.3,12.2,5.6,4.7,rain,1,1
4,2012-01-05,1.3,8.9,2.8,6.1,rain,1,1


### Создание колонки с таргетом. Приведение ее к нужному типу

In [9]:
df['weather'].value_counts()

Unnamed: 0_level_0,count
weather,Unnamed: 1_level_1
rain,641
sun,640
fog,101
drizzle,53
snow,26


In [15]:
df.loc[df['weather'] == 'fog', 'target'] = 1

In [19]:
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,month,quarter,target
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,1,1,0
1,2012-01-02,10.9,10.6,2.8,4.5,rain,1,1,0
2,2012-01-03,0.8,11.7,7.2,2.3,rain,1,1,0
3,2012-01-04,20.3,12.2,5.6,4.7,rain,1,1,0
4,2012-01-05,1.3,8.9,2.8,6.1,rain,1,1,0


In [17]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1.0,741
0.0,720


In [18]:
df['target'] = df['target'].astype(int)

### Удаление ненужных колонок и разделение датасета

In [20]:
X = df.drop(['date','target','weather'], axis=1)
Y = df['target']


### Логистическая регрессия : деление сета


In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.15)

### Логистическая регрессия : Тренировка и прогноз

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [24]:
pipe = Pipeline(
    [
        ('Scaler', StandardScaler()),
        ('LR', LogisticRegression())
    ]
)

In [25]:
pipe.fit(X_train, Y_train)

In [26]:
predict = pipe.predict(X_test)

### Оценка работы модели

In [27]:
pipe.score(X_test, Y_test)

0.8863636363636364

### Результаты

In [28]:
X_test.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind,month,quarter
463,0.8,13.3,6.1,3.1,4,2
242,0.0,22.8,12.8,1.9,8,3
526,0.0,21.7,11.7,3.2,6,2
332,2.8,9.4,2.2,2.9,11,4
1097,1.5,5.6,0.0,2.3,1,1


In [29]:
Y_test.head()

Unnamed: 0,target
463,0
242,1
526,1
332,0
1097,0


In [30]:
predict[:5]

array([0, 1, 1, 0, 0])

In [31]:
pipe.classes_

array([0, 1])

In [33]:
pipe.predict_proba(X_test)[0:5]

array([[0.55666614, 0.44333386],
       [0.18940783, 0.81059217],
       [0.22192885, 0.77807115],
       [0.9209289 , 0.0790711 ],
       [0.83518628, 0.16481372]])

In [35]:
pipe.decision_function(X_test)[0:5]

array([-0.22764256,  1.4538625 ,  1.25446113, -2.45503537, -1.62283892])

In [38]:
def sigmoid(x):
  return (1 / (1 + np.exp(x)))

In [39]:
sigmoid(-0.22764256)

0.5566661424388795