# Data on the number of fires in Brazil

## Analysis of a dataset containing information on the number of fires in various regions of Brazil from 1998 to 2017

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn import decomposition
from tqdm.notebook import tqdm
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Read the data
data_raw = pd.read_csv('amazon.csv', encoding='latin1')
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    6454 non-null   int64  
 1   state   6454 non-null   object 
 2   month   6454 non-null   object 
 3   number  6454 non-null   float64
 4   date    6454 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 252.2+ KB


Based on the analysis carried out in work №. 5 for this dataset, the fetaures by which predictive models can be built is the number of fires - a real parameter

## Data preparation

In [7]:
columns = ["year", "state", "month", "number"]
data1 = data_raw.loc[:, data_raw.columns.isin(columns)]

In [7]:
data1 = data1.drop(data1[data1['state'] == 'Rio'].index)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5737 entries, 0 to 6453
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    5737 non-null   int64  
 1   state   5737 non-null   object 
 2   month   5737 non-null   object 
 3   number  5737 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 224.1+ KB


Create a column with month numbers

In [8]:
months_map = {'Janeiro' : 1, 'Fevereiro' : 2, 'Março' : 3, 'Abril' : 4, 'Maio' : 5, 'Junho' : 6,
             'Julho' : 7, 'Agosto' : 8, 'Setembro' : 9, 'Outubro' : 10, 'Novembro' : 11, 'Dezembro' : 12}

data1['month_number'] = data1['month']
for mon in months_map.keys():
    data1['month_number'] = np.where(data1['month_number'] == mon, int(months_map[mon]), data1['month_number'])

Now let's get dummy variables responsible for the time of year

In [9]:
data1['IsWinter'] = np.where(data1['month_number'] <= 2, 1, 0)
data1['IsWinter'] = np.where(data1['month_number'] == 12, 1, data1['IsWinter'])

data1['IsSpring'] = np.where(data1['month_number'] <= 5, 1, 0)
data1['IsSpring'] = np.where(data1['month_number'] <= 2, 0, data1['IsSpring'])

data1['IsSummer'] = np.where(data1['month_number'] <= 8, 1, 0)
data1['IsSummer'] = np.where(data1['month_number'] <= 5, 0, data1['IsSummer'])

In [10]:
data1['IsAutumn'] = np.where(data1['month_number'] >= 9, 1, 0)
data1['IsAutumn'] = np.where(data1['month_number'] == 12, 0, data1['IsAutumn'])

Since the column with the name of the states is the most numerous, we will also create generalized dummy variables for it, showing belonging to the district in accordance with the administrative division

In [11]:
rows_number = 5737
North_States = ['Acre', 'Amapa', 'Amazonas', 'Pará', 'Rondonia', 'Roraima']
N_S_map = {state : 1 for state in North_States}

data1['IsNorth'] = [0] * rows_number
for state in N_S_map.keys():
    data1['IsNorth'] =  np.where(data1['state'] == state, N_S_map[state], data1['IsNorth'])

In [12]:
North_West_States = ['Alagoas', 'Bahia', 'Ceara', 'Maranhao',
               'Paraiba', 'Pernambuco', 'Piau',
               'Sergipe', 'Tocantins']
N_W_S_map = {state : 1 for state in North_West_States}

data1['IsNorthWest'] = [0] * rows_number
for state in N_W_S_map.keys():
    data1['IsNorthWest'] =  np.where(data1['state'] == state, N_W_S_map[state], data1['IsNorthWest'])

In [13]:
Center_West_States = ['Distrito Federal', 'Goias','Mato Grosso']
C_W_map = {state : 1 for state in Center_West_States}

data1['IsCenterWest'] = [0] * rows_number
for state in C_W_map.keys():
    data1['IsCenterWest'] =  np.where(data1['state'] == state, C_W_map[state], data1['IsCenterWest'])

In [14]:
South_West_States = ['Espirito Santo', 'Minas Gerais', 'Sao Paulo']
S_W_map = {state : 1 for state in South_West_States}

data1['IsSouthWest'] = [0] * rows_number
for state in S_W_map.keys():
    data1['IsSouthWest'] =  np.where(data1['state'] == state, S_W_map[state], data1['IsSouthWest'])

In [15]:
data1['IsSouth'] = [0] * rows_number
data1['IsSouth'] =  np.where(data1['state'] == 'Santa Catarina', 1, data1['IsSouth'])

In [16]:
data1["number"] = data1["number"].apply(int)
data1["month_number"] = data1["month_number"].apply(int)

In [17]:
# Remove unnecessary variables from data
data1 = data1.drop(columns = ["month", "state", "month_number"], axis = 1)

Let's introduce a new dataset and normalize the columns with real values

In [497]:
columns3 = ["year", "number", "IsWinter", "IsSpring", "IsSummer", "IsAutumn",
            "IsNorth", "IsNorthWest", "IsCenterWest", "IsSouthWest", "IsSouth"]
data3 = data1.loc[:, data1.columns.isin(columns3)]

In [498]:
data3["number"] = (data3["number"] - data3["number"].mean())/data3["number"].std()
data3["year"] = (data3["year"] - data3["year"].mean())/data3["year"].std()

In [424]:
data3.head()

Unnamed: 0,year,number,IsWinter,IsSpring,IsSummer,IsAutumn,IsNorth,IsNorthWest,IsCenterWest,IsSouthWest,IsSouth
0,-1.646465,-0.576265,1,0,0,0,1,0,0,0,0
1,-1.472455,-0.576265,1,0,0,0,1,0,0,0,0
2,-1.298445,-0.576265,1,0,0,0,1,0,0,0,0
3,-1.124436,-0.576265,1,0,0,0,1,0,0,0,0
4,-0.950426,-0.576265,1,0,0,0,1,0,0,0,0


## Linear regression

Let's build a linear regression on this data

In [425]:
test = data3["number"]
train = data3.drop(['number'], axis=1) 
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size = 0.3, random_state = 1)

In [389]:
linmodel = LinearRegression()
linmodel.fit(X_train,y_train)

LinearRegression()

In [391]:
linmodel.score(X_test, y_test)

0.7003484320557491

The accuracy of the resulting model is 70%. This is a fairly good value given the small number of variables describing the number of fires

## Classification

Let's try to solve the classification problem on these data. Let's split the values of the 'number' column into classes. Select 10 classes, the class number is one less than the number of hundreds of the value of the column 'number'

In [38]:
columns2 = ["year", "number", "IsWinter", "IsSpring", "IsSummer", "IsAutumn",
            "IsNorth", "IsNorthWest", "IsCenterWest", "IsSouthWest", "IsSouth"]
data2 = data1.loc[:, data1.columns.isin(columns2)]

In [39]:
data2['fire_level'] = data2['number']
data2['fire_level'] = np.where(data2['fire_level'] <= 99, 1, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 900, 10, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 800, 9, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 700, 8, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 600, 7, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 500, 6, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 400, 5, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 300, 4, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 200, 3, data2['fire_level'])
data2['fire_level'] = np.where(data2['fire_level'] >= 100, 2, data2['fire_level'])
data2["fire_level"] = data2["fire_level"].apply(int)

In [40]:
data2 = data2.drop(columns = ["number"], axis = 1)

In [395]:
data2.tail()

Unnamed: 0,year,IsWinter,IsSpring,IsSummer,IsNorth,IsNorthWest,IsCenterWest,IsSouthWest,IsSouth,fire_level
6449,2012,1,0,0,0,1,0,0,0,2
6450,2013,1,0,0,0,1,0,0,0,1
6451,2014,1,0,0,0,1,0,0,0,3
6452,2015,1,0,0,0,1,0,0,0,4
6453,2016,1,0,0,0,1,0,0,0,2


In [41]:
data2 = data2.sample(frac=1)

In [506]:
# Test and training sets
test = data2["fire_level"]
train = data2.drop(['fire_level'], axis=1) 
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size = 0.3, random_state = 1)

### Decision tree

In [507]:
for i in range(1, 30):
    T = DecisionTreeClassifier(random_state=241, max_depth = i)
    T = T.fit(X_train, y_train)
    print(str(i) + ": " + str(T.score(X_test, y_test)))

1: 0.7171893147502904
2: 0.7171893147502904
3: 0.7171893147502904
4: 0.7171893147502904
5: 0.7131242740998839
6: 0.710801393728223
7: 0.7049941927990708
8: 0.7038327526132404
9: 0.7032520325203252
10: 0.7044134727061556
11: 0.7015098722415796
12: 0.7015098722415796
13: 0.7015098722415796
14: 0.7015098722415796
15: 0.7015098722415796
16: 0.7015098722415796
17: 0.7015098722415796
18: 0.7015098722415796
19: 0.7015098722415796
20: 0.7015098722415796
21: 0.7015098722415796
22: 0.7015098722415796
23: 0.7015098722415796
24: 0.7015098722415796
25: 0.7015098722415796
26: 0.7015098722415796
27: 0.7015098722415796
28: 0.7015098722415796
29: 0.7015098722415796


Note right away that the best accuracy is achieved with a tree with a depth of 1

In [508]:
Tree = DecisionTreeClassifier(random_state=241, max_depth=1)
Tree = Tree.fit(X_train, y_train)
report = classification_report(y_test, Tree.predict(X_test), target_names=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
print("Depth-optimized tree:\n", report)

Depth-optimized tree:
               precision    recall  f1-score   support

           1       0.72      1.00      0.84      1235
           2       0.00      0.00      0.00       178
           3       0.00      0.00      0.00        83
           4       0.00      0.00      0.00        57
           5       0.00      0.00      0.00        48
           6       0.00      0.00      0.00        28
           7       0.00      0.00      0.00        21
           8       0.00      0.00      0.00        24
           9       0.00      0.00      0.00        24
          10       0.00      0.00      0.00        24

    accuracy                           0.72      1722
   macro avg       0.07      0.10      0.08      1722
weighted avg       0.51      0.72      0.60      1722



We see that the classifier determines only objects of the first class. Moreover, by the value of the recall metric, we can conclude that it does not mark other classes for the first one

### K-nearest neighbor method

Let's try to build a classifier based on another algorithm

In [509]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Let's find the possible number of neighbors and find the optimal

In [510]:
ks = []
for k in range(1, 50, 2):
    classifier = KNeighborsClassifier(n_neighbors = k)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    ks.append((accuracy_score(y_test, y_pred)* 100, k))

max_accuracy = ks[0][0]
max_k = ks[0][1]
for i in range(1, len(ks)):
    if ks[i][0] > max_accuracy:
        max_accuracy = ks[i][0]
        max_k = ks[i][1]
        
print('Optimal number of nearest neighbors: ', max_k, '\n' + 'Maximum accuracy: ', max_accuracy)

Optimal number of nearest neighbors:  23 
Maximum accuracy:  71.71893147502904


In [511]:
classifier = KNeighborsClassifier(n_neighbors = max_k)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=23)

In [512]:
report = classification_report(y_test, classifier.predict(X_test),
                               target_names=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
print(report)

              precision    recall  f1-score   support

           1       0.72      1.00      0.84      1235
           2       0.00      0.00      0.00       178
           3       0.00      0.00      0.00        83
           4       0.00      0.00      0.00        57
           5       0.00      0.00      0.00        48
           6       0.00      0.00      0.00        28
           7       0.00      0.00      0.00        21
           8       0.00      0.00      0.00        24
           9       0.00      0.00      0.00        24
          10       0.00      0.00      0.00        24

    accuracy                           0.72      1722
   macro avg       0.07      0.10      0.08      1722
weighted avg       0.51      0.72      0.60      1722



We see that in this case the same problem arises - only the first class is described. It arises because the classes are not sbanasirovanny - the first will greatly exceed the rest in terms of the number of objects, so the classifier learns to explain it more effectively

To solve this problem, we introduce an array of weights for each sample from the training set. Thus, we will balance the classes and make them equally significant for the classifier

## Array of weights

In [520]:
train_samples = np.array(y_train)
classes_sizes = data2['fire_level'].value_counts()
sample_weights = []
for i in range(sum(y_train.value_counts())):
    sample_weights.append(classes_sizes[1] / classes_sizes[train_samples[i]])

Проверим теперь работу классификаторов с массивом весов.

### Let's check the classifiers with an array of weights

In [521]:
clf = LogisticRegression(random_state=2, solver='lbfgs', multi_class='multinomial')
clf.fit(X_train, y_train, sample_weight=sample_weights)

LogisticRegression(multi_class='multinomial', random_state=2)

In [523]:
report = classification_report(y_test, clf.predict(X_test), target_names=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
print("Logistic regression:\n", report)

Logistic regression:
               precision    recall  f1-score   support

           1       0.89      0.29      0.44      1235
           2       0.17      0.29      0.22       178
           3       0.06      0.07      0.07        83
           4       0.03      0.02      0.02        57
           5       0.00      0.00      0.00        48
           6       0.02      0.07      0.03        28
           7       0.03      0.29      0.05        21
           8       0.06      0.33      0.10        24
           9       0.02      0.38      0.05        24
          10       0.02      0.08      0.04        24

    accuracy                           0.26      1722
   macro avg       0.13      0.18      0.10      1722
weighted avg       0.67      0.26      0.34      1722



Now we see that the classifier has worked out the prediction of almost all classes, except for one. However, the prediction accuracy is very small, it also dropped a lot for class 1. The precision metric for only two classes is more than 17%.
However, the accuracy of the entire classifier is 26%, the classifier is more effective than random class selection (the probability of guessing one class out of 10 is 10%)

### Decision Tree with Array of Weights

In [528]:
for i in range(1, 30):
    T = DecisionTreeClassifier(random_state=241, max_depth = i)
    T = T.fit(X_train, y_train, sample_weight=sample_weights)
    print(str(i) + ": " + str(T.score(X_test, y_test)))

1: 0.21777003484320556
2: 0.22125435540069685
3: 0.1951219512195122
4: 0.18641114982578397
5: 0.21254355400696864
6: 0.19279907084785133
7: 0.20615563298490128
8: 0.18641114982578397
9: 0.15447154471544716
10: 0.16840882694541232
11: 0.17015098722415795
12: 0.16666666666666666
13: 0.17015098722415795
14: 0.17015098722415795
15: 0.17073170731707318
16: 0.17073170731707318
17: 0.17073170731707318
18: 0.17073170731707318
19: 0.17073170731707318
20: 0.17073170731707318
21: 0.17073170731707318
22: 0.17073170731707318
23: 0.17073170731707318
24: 0.17073170731707318
25: 0.17073170731707318
26: 0.17073170731707318
27: 0.17073170731707318
28: 0.17073170731707318
29: 0.17073170731707318


In [529]:
Tree = DecisionTreeClassifier(random_state=241, max_depth=5)
Tree = Tree.fit(X_train, y_train, sample_weight=sample_weights)
report = classification_report(y_test, Tree.predict(X_test), target_names=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
print("Depth-optimized tree:\n", report)

Depth-optimized tree:
               precision    recall  f1-score   support

           1       0.91      0.26      0.40      1235
           2       0.18      0.12      0.15       178
           3       0.06      0.19      0.09        83
           4       0.00      0.00      0.00        57
           5       0.00      0.00      0.00        48
           6       0.01      0.04      0.01        28
           7       0.01      0.38      0.03        21
           8       0.03      0.08      0.04        24
           9       0.01      0.08      0.02        24
          10       0.00      0.00      0.00        24

    accuracy                           0.21      1722
   macro avg       0.12      0.12      0.07      1722
weighted avg       0.67      0.21      0.31      1722



### Random forest with array of weights

In [530]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth' : [2, 5, 10, 15, 20],
    'random_state': [241],
}
scoring = ['accuracy', 'f1', 'precision', 'recall']

RFC = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, scoring=scoring, refit='f1') 
RFC.fit(X_train, y_train, sample_weight=sample_weights)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 5, 10, 15, 20],
                         'n_estimators': [50, 100, 150],
                         'random_state': [241]},
             refit='f1', scoring=['accuracy', 'f1', 'precision', 'recall'])

In [531]:
best_RFC = RFC.best_estimator_
best_RFC.fit(X_train, y_train, sample_weight=sample_weights)

report = classification_report(y_test, best_RFC.predict(X_test), target_names=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
print("Best RFC:\n", report)

Best RFC:
               precision    recall  f1-score   support

           1       0.92      0.27      0.42      1235
           2       0.16      0.39      0.22       178
           3       0.00      0.00      0.00        83
           4       0.00      0.00      0.00        57
           5       0.00      0.00      0.00        48
           6       0.04      0.25      0.06        28
           7       0.01      0.24      0.03        21
           8       0.08      0.25      0.12        24
           9       0.02      0.12      0.03        24
          10       0.02      0.04      0.03        24

    accuracy                           0.25      1722
   macro avg       0.12      0.16      0.09      1722
weighted avg       0.68      0.25      0.33      1722



The results of random forest and decision tree are similar to the result of logistic regression, only in these cases even fewer classes were predicted by the classifier. For 3 classes there is no prediction at all

We can conclude that splitting into many classes in a given dataframe is not effective, it is better to use linear regression to predict the number of fires

## Split into 2 classes

Let's try to increase the accuracy by reducing the number of classes. Let's divide the data into fire hazardous situations and not. The first category includes objects for which the number of fires is more than 250

In [8]:
columns4 = ["year", "number", "IsWinter", "IsSpring", "IsSummer", "IsAutumn",
            "IsNorth", "IsNorthWest", "IsCenterWest", "IsSouthWest", "IsSouth"]
data4 = data1.loc[:, data1.columns.isin(columns4)]

In [9]:
data4['fire_level'] = data4['number']
data4['fire_level'] = np.where(data4['fire_level'] <= 250 , 0, 1)
data4 = data4.drop(columns = ["number"], axis = 1)

In [10]:
# Train & test splits
test = data4["fire_level"]
train = data4.drop(['fire_level'], axis=1) 
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size = 0.3, random_state = 1)

In [76]:
# Array of weights for the given sample
train_samples = np.array(y_train)
classes_sizes = data4['fire_level'].value_counts()
sample_weights = []
for i in range(sum(y_train.value_counts())):
    sample_weights.append(classes_sizes[0] / classes_sizes[train_samples[i]])

In [77]:
for i in range(1, 30):
    T = DecisionTreeClassifier(random_state=241, max_depth = i)
    T = T.fit(X_train, y_train, sample_weight=sample_weights)
    print(str(i) + ": " + str(T.score(X_test, y_test)))

1: 0.3821138211382114
2: 0.5545876887340302
3: 0.5586527293844367
4: 0.5952380952380952
5: 0.5842044134727061
6: 0.5766550522648084
7: 0.5418118466898955
8: 0.5720092915214866
9: 0.5714285714285714
10: 0.5830429732868757
11: 0.5807200929152149
12: 0.5813008130081301
13: 0.5859465737514518
14: 0.5905923344947736
15: 0.5876887340301974
16: 0.5923344947735192
17: 0.5923344947735192
18: 0.5923344947735192
19: 0.5923344947735192
20: 0.5923344947735192
21: 0.5923344947735192
22: 0.5923344947735192
23: 0.5923344947735192
24: 0.5923344947735192
25: 0.5923344947735192
26: 0.5923344947735192
27: 0.5923344947735192
28: 0.5923344947735192
29: 0.5923344947735192


In [69]:
Tree = DecisionTreeClassifier(random_state=241, max_depth=4)
Tree = Tree.fit(X_train, y_train, sample_weight=sample_weights)

In [70]:
report = classification_report(y_test, Tree.predict(X_test), target_names=['0', '1'])
print("Depth-optimized tree:\n", report)

Depth-optimized tree:
               precision    recall  f1-score   support

           0       0.92      0.57      0.70      1456
           1       0.24      0.73      0.36       266

    accuracy                           0.60      1722
   macro avg       0.58      0.65      0.53      1722
weighted avg       0.81      0.60      0.65      1722



Prediction accuracy for two classes is low - 60%. Specifically for class 0, data with less than 250 fires, the f1 metric value is 70%, which is almost equal to the linear regression value

Thus, we can conclude that it is better to use a linear model to predict the number of fires. If there is a task of conditional division into fire hazardous situations and not, then a classifier can be used, because it has a high value of the precision metric for non-fire objects - 92%, which means that it rarely marks objects of another class as non-flammable