In [2]:
import pandas as pd

import scipy
from scipy import sparse

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

import pickle


### Read data

In [3]:
weather_alarms_tfidf = pd.read_csv('data/df_weather+alarms_merged.csv')

In [4]:
weather_alarms_tfidf.shape

(203420, 70)

In [5]:
weather_alarms_tfidf.head()

Unnamed: 0.1,Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,...,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch,isw_report_date,isw_date_tomorrow_datetime,isw_keywords,isw_main_html_v6,isw_report_text_lemm
0,0,"Черкаси, Україна",2022-02-24,1645653600,4.9,-0.1,1.9,-0.6,83.4,0.0,...,,,,,,,,,,
1,1,"Черкаси, Україна",2022-02-24,1645653600,4.9,-0.1,1.9,-0.6,83.4,0.0,...,,,,,,,,,,
2,2,"Черкаси, Україна",2022-02-24,1645653600,4.9,-0.1,1.9,-0.6,83.4,0.0,...,,,,,,,,,,
3,3,"Черкаси, Україна",2022-02-24,1645653600,4.9,-0.1,1.9,-0.6,83.4,0.0,...,,,,,,,,,,
4,4,"Черкаси, Україна",2022-02-24,1645653600,4.9,-0.1,1.9,-0.6,83.4,0.0,...,,,,,,,,,,


In [6]:
weather_alarms_tfidf.columns

Index(['Unnamed: 0', 'city_resolvedAddress', 'day_datetime',
       'day_datetimeEpoch', 'day_tempmax', 'day_tempmin', 'day_temp',
       'day_dew', 'day_humidity', 'day_precip', 'day_precipcover',
       'day_solarradiation', 'day_solarenergy', 'day_uvindex', 'day_sunrise',
       'day_sunset', 'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch',
       'hour_temp', 'hour_humidity', 'hour_dew', 'hour_precip',
       'hour_precipprob', 'hour_snow', 'hour_snowdepth', 'hour_preciptype',
       'hour_windgust', 'hour_windspeed', 'hour_winddir', 'hour_pressure',
       'hour_visibility', 'hour_cloudcover', 'hour_solarradiation',
       'hour_solarenergy', 'hour_uvindex', 'hour_severerisk',
       'hour_conditions', 'city', 'region', 'center_city_ua', 'center_city_en',
       'region_alt', 'region_id', 'event_Unnamed: 0', 'event_region_title',
       'event_region_city', 'event_all_region', 'event_start', 'event_end',
       'event_clean_end', 'event_intersection_alarm_id', 'event_start_

In [7]:
weather_alarms_tfidf.shape

(203420, 70)

### TF_IDF + CV with isw_report_text_lemm preprocessed

In [8]:
tfidf = pickle.load(open("models/tfidf_transformer_v1.pkl","rb"))
cv = pickle.load(open("models/count_vectorizer_v1.pkl","rb"))

#Uncommit if you want to load exist matrix
tfidf_vector = scipy.sparse.load_npz('data/matrix/tfidf_vector_train.npz')

# #Uncommit if you want to create new one
# word_count_vector = cv.transform(weather_alarms_tfidf['isw_report_text_lemm'].values.astype('U'))
# tfidf_vector = tfidf.transform(word_count_vector)

  tfidf = pickle.load(open("models/tfidf_transformer_v1.pkl","rb"))
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
# #Save model and matrix
# with open("models/tfidf_vector_calculated.pkl", "wb") as handle:
#    pickle.dump(tfidf_vector, handle)
# scipy.sparse.save_npz('data/matrix/tfidf_vector_train.npz', tfidf_vector)

In [10]:
tfidf_vector

<203420x7401 sparse matrix of type '<class 'numpy.float64'>'
	with 123370706 stored elements in Compressed Sparse Row format>

### Dataset Preprocesing

In [11]:
weather_alarms_tfidf['isAlarm'] = weather_alarms_tfidf['event_start_hour'].notnull().astype(int)
Y = weather_alarms_tfidf['isAlarm']
weather_alarms_tfidf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203420 entries, 0 to 203419
Data columns (total 71 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   Unnamed: 0                                      203420 non-null  int64  
 1   city_resolvedAddress                            203420 non-null  object 
 2   day_datetime                                    203420 non-null  object 
 3   day_datetimeEpoch                               203420 non-null  int64  
 4   day_tempmax                                     203420 non-null  float64
 5   day_tempmin                                     203420 non-null  float64
 6   day_temp                                        203420 non-null  float64
 7   day_dew                                         203420 non-null  float64
 8   day_humidity                                    203420 non-null  float64
 9   day_precip                

In [12]:
#Clear data
weather_alarms_tfidf_features = weather_alarms_tfidf.drop(columns=['event_all_region', 'event_start',
       'event_end', 'event_clean_end', 'event_intersection_alarm_id',
       'event_start_time', 'event_end_time', 'event_day',
       'event_feature_number_of_region', 'event_within_24_hours',
       'event_feature_number_of_alarms_within_24_hours', 'event_start_hour',
       'event_end_hour', 'event_day_date', 'event_start_hour_datetimeEpoch',
       'event_end_hour_datetimeEpoch', 'event_hour_level_event_time',
       'event_hour_level_event_datetimeEpoch','region_alt','hour_conditions','center_city_en','hour_datetime','day_sunset','day_sunrise','isAlarm','hour_precip','hour_solarradiation','hour_solarenergy','hour_uvindex', 'event_region_title', 'event_region_city', 'isw_report_date', 'isw_date_tomorrow_datetime', 'isw_keywords', 'isw_main_html_v6', 'isw_report_text_lemm', 'city', 'region', 'center_city_ua', 'hour_preciptype', 'city_resolvedAddress', 'day_datetime', 'event_Unnamed: 0'],axis=1)

In [13]:
weather_alarms_tfidf_features_clear = weather_alarms_tfidf_features
weather_alarms_tfidf_features_clear.to_csv('data/weather_alarms_tfidf_features_clear.csv')

In [14]:
weather_alarms_tfidf_features_clear.isnull().sum()

Unnamed: 0            0
day_datetimeEpoch     0
day_tempmax           0
day_tempmin           0
day_temp              0
day_dew               0
day_humidity          0
day_precip            0
day_precipcover       0
day_solarradiation    0
day_solarenergy       0
day_uvindex           0
day_moonphase         0
hour_datetimeEpoch    0
hour_temp             0
hour_humidity         0
hour_dew              0
hour_precipprob       0
hour_snow             0
hour_snowdepth        0
hour_windgust         0
hour_windspeed        0
hour_winddir          0
hour_pressure         0
hour_visibility       0
hour_cloudcover       0
hour_severerisk       0
region_id             0
dtype: int64

In [15]:
weather_alarms_tfidf_csr = scipy.sparse.csr_matrix(weather_alarms_tfidf_features_clear)
weather_alarms_tfidf_features = scipy.sparse.hstack((weather_alarms_tfidf_csr, tfidf_vector), format="csr")

In [16]:
scipy.sparse.save_npz('data/matrix/weather_alarms_tfidf_features.npz', weather_alarms_tfidf_features)

In [17]:
weather_alarms_tfidf_features

<203420x7429 sparse matrix of type '<class 'numpy.float64'>'
	with 128262032 stored elements in Compressed Sparse Row format>

### Devide data into train and test parts and test models

In [18]:
#Imports
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # For SVM
from sklearn.linear_model import SGDClassifier # For SGD
from sklearn.ensemble import RandomForestClassifier # For Random Forest
from sklearn.naive_bayes import GaussianNB # For Gaussian Naive Bayes
from sklearn.neighbors import KNeighborsClassifier # For K-nearest neighbors
from sklearn.model_selection import train_test_split # For data splitting
from sklearn.metrics import accuracy_score, classification_report # For model evaluation
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import shuffle

COMMAND_ID = "4"

# Split the dataset into input features (X) and target variable (y)
X = weather_alarms_tfidf_features  # Assuming the target variable is in the last column
y = Y  # Assuming the target variable is in the last column

X, y = shuffle(X, y, random_state=42)

In [19]:
stop

NameError: name 'stop' is not defined

### Model 1 SGDClassifier

In [None]:
#25000/5000 - train/test data
sgd = SGDClassifier()

sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
sgd_accuracy = accuracy_score(y_test, sgd_pred)
sgd_report = classification_report(y_test, sgd_pred)

print("SGDClassifier Accuracy:", sgd_accuracy)
print("SGDClassifier Report:\n", sgd_report)

# Save model
MODEL_NAME = "sgd"
VERSION = "1.4"
with open(f'models/training_models/{COMMAND_ID}_{MODEL_NAME}_{VERSION}.pkl', 'wb') as f:
    pickle.dump(sgd, f)

### Model 2 LogisticRegression

In [None]:
#25000/5000 - train/test data
lr = LogisticRegression()

lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_report = classification_report(y_test, lr_pred)

print("LogisticRegression Accuracy:", lr_accuracy)
print("LogisticRegression Report:\n", lr_report)

# Save model
MODEL_NAME = "logreg"
VERSION = "1.5f" # f - full data set
with open(f'models/training_models/{COMMAND_ID}_{MODEL_NAME}_{VERSION}.pkl', 'wb') as f:
    pickle.dump(lr, f)

### Model 3 GaussianNB

In [None]:
tscv = TimeSeriesSplit(n_splits=3)

for train_index, test_index in tscv.split(X):

    # Access the train and test data for X and y
    X_train, X_test = X[train_index].toarray(), X[test_index].toarray()
    y_train, y_test = y[train_index], y[test_index]

    # show sets size
    print(X_train.shape)
    print(X_test.shape)

    gnb = GaussianNB()

    gnb.fit(X_train, y_train)
    gnb_pred = gnb.predict(X_test)
    gnb_accuracy = accuracy_score(y_test, gnb_pred)
    gnb_report = classification_report(y_test, gnb_pred)

    print("GaussianNB Accuracy:", gnb_accuracy)
    print("GaussianNB Report:\n",gnb_report)

In [None]:
#Save model
MODEL_NAME = "gnb"
VERSION = "1.3h" # h - half data set
with open(f'models/training_models/{COMMAND_ID}_{MODEL_NAME}_{VERSION}.pkl', 'wb') as f:
    pickle.dump(gnb, f)

### Model 4 RandomForestClassifier

In [23]:
tscv = TimeSeriesSplit(n_splits=3, max_train_size=152565, test_size=50855)

for train_index, test_index in tscv.split(X):
    # Access the train and test data for X and y
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print(X_train.shape)
print(X_test.shape)

rf = RandomForestClassifier()

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)

print("RandomForestClassifier Accuracy:", rf_accuracy)
print("RandomForestClassifier Report:\n", rf_report)

# Save model
MODEL_NAME = "rf"
VERSION = "3.0f"
with open(f'models/training_models/{COMMAND_ID}_{MODEL_NAME}_{VERSION}.pkl', 'wb') as f:
    pickle.dump(rf, f)


(152565, 7429)
(50855, 7429)
RandomForestClassifier Accuracy: 0.7202438304984761
RandomForestClassifier Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83     41633
           1       0.18      0.15      0.17      9222

    accuracy                           0.72     50855
   macro avg       0.50      0.50      0.50     50855
weighted avg       0.70      0.72      0.71     50855



### Model 5 SVC

In [None]:
tscv = TimeSeriesSplit(n_splits=10)
circle = 0

for train_index, test_index in tscv.split(X):

    # Access the train and test data for X and y
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    print(X_train.shape)
    print(X_test.shape)

    svm = SVC()

    svm.fit(X_train, y_train)
    svm_pred = svm.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_pred)
    svm_report = classification_report(y_test, svm_pred)

    print("SVC Accuracy:", svm_accuracy)
    print("SVC Report:\n", svm_report)

    # Save model
    MODEL_NAME = "svm"
    VERSION = "2."+str(circle)
    with open(f'models/training_models/{COMMAND_ID}_{MODEL_NAME}_{VERSION}.pkl', 'wb') as f:
        pickle.dump(svm, f)

    circle=circle+1

### Model 6 KNeighborsClassifier

In [None]:
#25000/5000 - train/test data
tscv = TimeSeriesSplit(n_splits=2, max_train_size=5000, test_size=1000)
circle = 0

# for train_index, test_index in tscv.split(X):
    # # Access the train and test data for X and y
    # X_train, X_test = X[train_index], X[test_index]
    # y_train, y_test = y[train_index], y[test_index]
    #
    # print(X_train.shape)
    # print(X_test.shape)
    #
    # knn = KNeighborsClassifier()
    #
    # knn.fit(X_train, y_train)
    # knn_pred = knn.predict(X_test)
    # knn_accuracy = accuracy_score(y_test, knn_pred)
    # knn_report = classification_report(y_test, knn_pred)
    #
    # print("KNeighborsClassifier Accuracy:", knn_accuracy)
    # print("KNeighborsClassifier Report:\n", knn_report)
    #
    # # Save model
    # MODEL_NAME = "knn"
    # VERSION = "2."+str(circle)
    # with open(f'models/training_models/{COMMAND_ID}_{MODEL_NAME}_{VERSION}.pkl', 'wb') as f:
    #     pickle.dump(knn, f)
    # circle=circle+1

# Access the train and test data for X and y
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

print(X_train.shape)
print(X_test.shape)

knn = KNeighborsClassifier()

knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred)

print("KNeighborsClassifier Accuracy:", knn_accuracy)
print("KNeighborsClassifier Report:\n", knn_report)

# Save model
MODEL_NAME = "knn"
VERSION = "3."+str(circle)
with open(f'models/training_models/{COMMAND_ID}_{MODEL_NAME}_{VERSION}.pkl', 'wb') as f:
    pickle.dump(knn, f)