In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [67]:
df = pd.read_csv("ForestFires1.csv")
df.head(15)

Unnamed: 0,lat,lon,data_hora_gmt,satelite,pais
0,-9.02441,31.9325,2022-08-14 12:17:00,NOAA-20,Zambia
1,-9.18513,31.40951,2022-08-14 12:17:00,NOAA-20,Zambia
2,-12.82862,17.48835,2022-08-14 12:17:00,NOAA-20,Angola
3,-12.77537,21.02167,2022-08-14 12:17:00,NOAA-20,Angola
4,-12.77878,21.02223,2022-08-14 12:17:00,NOAA-20,Angola
5,-11.5793,19.87347,2022-08-14 12:17:00,NOAA-20,Angola
6,-11.87825,19.87436,2022-08-14 12:17:00,NOAA-20,Angola
7,-11.92603,20.45582,2022-08-14 12:17:00,NOAA-20,Angola
8,-11.92939,20.45636,2022-08-14 12:17:00,NOAA-20,Angola
9,-11.98693,19.98027,2022-08-14 12:17:00,NOAA-20,Angola


In [68]:
df.tail()


Unnamed: 0,lat,lon,data_hora_gmt,satelite,pais
1361844,-6.0,-53.63,2022-08-24 21:54:49,GOES-16,Brasil
1361845,-6.14,-52.85,2022-08-24 21:54:49,GOES-16,Brasil
1361846,-6.0,-53.65,2022-08-24 21:54:49,GOES-16,Brasil
1361847,-5.98,-53.63,2022-08-24 21:54:49,GOES-16,Brasil
1361848,-6.15,-53.32,2022-08-24 21:54:49,GOES-16,Brasil


In [69]:
# Data Cleaning and Preparation

In [70]:
df.columns

Index(['lat', 'lon', 'data_hora_gmt', 'satelite', 'pais'], dtype='object')

In [71]:
df.columns = df.columns.str.strip()
df.columns

Index(['lat', 'lon', 'data_hora_gmt', 'satelite', 'pais'], dtype='object')

In [72]:
df.shape


(1361849, 5)

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1361849 entries, 0 to 1361848
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   lat            1361849 non-null  float64
 1   lon            1361849 non-null  float64
 2   data_hora_gmt  1361849 non-null  object 
 3   satelite       1361849 non-null  object 
 4   pais           1361849 non-null  object 
dtypes: float64(2), object(3)
memory usage: 52.0+ MB


In [74]:
df.isnull().sum()

lat              0
lon              0
data_hora_gmt    0
satelite         0
pais             0
dtype: int64

In [75]:
df[df.isnull().any(axis = 1)]

Unnamed: 0,lat,lon,data_hora_gmt,satelite,pais


In [76]:
df['pais'].unique()

array(['Zambia', 'Angola', 'Democratic Republic of the Congo',
       'Republic of Congo', 'Tanzania', 'Malawi', 'Gabon', 'Portugal',
       'Brasil', 'Algeria', 'Canada', 'Italy', 'Tunisia', 'Morocco',
       'Spain', 'Croatia', 'France', 'Belgium', 'Slovakia',
       'United Kingdom', 'Denmark', 'Netherlands', 'Germany', 'Sweden',
       'Iceland', 'United States', 'South Africa', 'Botswana',
       'Mozambique', 'Zimbabwe', 'Bolivia', 'Australia', 'Fiji',
       'New Zealand', 'Japan', 'São Tomé and Príncipe', 'Russia',
       'Venezuela', 'Namibia', 'Senegal', 'Guyana', 'Paraguay', 'Peru',
       'Argentina', 'Suriname', 'Papua New Guinea', 'Malaysia',
       'Indonesia', 'China', 'Cuba', 'Colombia', 'Honduras', 'Nicaragua',
       'Timor-Leste', 'Uzbekistan', 'Kazakhstan', 'Uruguay', 'Mexico',
       'Dominican Republic', 'Mongolia', 'Taiwan', 'Vietnam',
       'Philippines', 'Iraq', 'Iran', 'Chile', 'Azerbaijan', 'Ukraine',
       'Ecuador', 'Costa Rica', 'Guatemala', 'Kyrgyzstan

In [77]:
df = pd.read_csv('ForestFires1.csv')
print(df.columns)

Index(['lat', 'lon', 'data_hora_gmt', 'satelite', 'pais'], dtype='object')


In [78]:
df.head(10)

Unnamed: 0,lat,lon,data_hora_gmt,satelite,pais
0,-9.02441,31.9325,2022-08-14 12:17:00,NOAA-20,Zambia
1,-9.18513,31.40951,2022-08-14 12:17:00,NOAA-20,Zambia
2,-12.82862,17.48835,2022-08-14 12:17:00,NOAA-20,Angola
3,-12.77537,21.02167,2022-08-14 12:17:00,NOAA-20,Angola
4,-12.77878,21.02223,2022-08-14 12:17:00,NOAA-20,Angola
5,-11.5793,19.87347,2022-08-14 12:17:00,NOAA-20,Angola
6,-11.87825,19.87436,2022-08-14 12:17:00,NOAA-20,Angola
7,-11.92603,20.45582,2022-08-14 12:17:00,NOAA-20,Angola
8,-11.92939,20.45636,2022-08-14 12:17:00,NOAA-20,Angola
9,-11.98693,19.98027,2022-08-14 12:17:00,NOAA-20,Angola


In [84]:
df = df.dropna()  # Drop rows with any missing values
df = df.dropna(subset=['satelite'])  # Drop rows with missing values in a specific column
df = df[df['satelite'] != '']  # Drop rows with empty values in a specific column

df.head(10)

Unnamed: 0,lat,lon,satelite,pais
0,-9.02441,31.9325,NOAA20,Zambia
1,-9.18513,31.40951,NOAA20,Zambia
2,-12.82862,17.48835,NOAA20,Angola
3,-12.77537,21.02167,NOAA20,Angola
4,-12.77878,21.02223,NOAA20,Angola
5,-11.5793,19.87347,NOAA20,Angola
6,-11.87825,19.87436,NOAA20,Angola
7,-11.92603,20.45582,NOAA20,Angola
8,-11.92939,20.45636,NOAA20,Angola
9,-11.98693,19.98027,NOAA20,Angola


In [85]:
df = df.fillna(0)  # Fill missing values with 0
df['satelite'] = df['satelite'].fillna('satelite')  # Fill missing values in a specific column with a default value
df['satelite'] = df['satelite'].replace('', 'satelite')  # Replace empty values in a specific column with a default value

df.head()

Unnamed: 0,lat,lon,satelite,pais
0,-9.02441,31.9325,NOAA20,Zambia
1,-9.18513,31.40951,NOAA20,Zambia
2,-12.82862,17.48835,NOAA20,Angola
3,-12.77537,21.02167,NOAA20,Angola
4,-12.77878,21.02223,NOAA20,Angola


In [81]:
# import pandas as pd

# df = pd.read_csv('ForestFires1.csv')
# df['satelite'] = df['satelite'].str.replace('[^0-9.]', '') # remove non-numeric characters
# df['satelite'] = df['satelite'].astype(float)

In [88]:
import pandas as pd

# read the dataset into a pandas dataframe
df = pd.read_csv('ForestFires1.csv')

# drop any rows with missing values
df.dropna(inplace=True)

# convert the 'data_hora_gmt' column to datetime format
df['data_hora_gmt'] = pd.to_datetime(df['data_hora_gmt'])

# drop the 'satelite' and 'pais' columns since they are not needed
df.drop(['data_hora_gmt'], axis=1, inplace=True)

# reset the index
df.reset_index(drop=True, inplace=True)


df['satelite'] = df['satelite'].str.replace('-', '')
df['lat'] = df['lat'].replace('-', '')


# save the cleaned dataset to a new CSV file
df.to_csv('cleaned_dataset.csv', index=False)


df.head()

Unnamed: 0,lat,lon,satelite,pais
0,-9.02441,31.9325,NOAA20,Zambia
1,-9.18513,31.40951,NOAA20,Zambia
2,-12.82862,17.48835,NOAA20,Angola
3,-12.77537,21.02167,NOAA20,Angola
4,-12.77878,21.02223,NOAA20,Angola


In [89]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Load your dataset
df = pd.read_csv('cleaned_dataset.csv')

# Split the dataset into features and target variable
X = df.drop('lat', axis=1)
y = df['lat']

# float(satelite)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the random forest classifier with 100 trees
rfc = RandomForestClassifier(n_estimators=100)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rfc.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


ValueError: could not convert string to float: 'AQUA_MM'