In [18]:
# Kurulum
!pip install earthaccess
!pip install netCDF4

# Basic usage
import earthaccess
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import netCDF4 as nc

# Log in to NASA Earthdata (a free account is required).
# NOTE(review): persist=True presumably asks earthaccess to store the
# credentials locally for later sessions — confirm against the earthaccess docs.
auth = earthaccess.login(persist=True)




In [19]:
# Search parameters for the TEMPO_NO2_L3 collection (version V03), limited to
# 2024-09-01 and to the bounding box given below.
tempo_query = dict(
    short_name="TEMPO_NO2_L3",
    version="V03",
    temporal=("2024-09-01 00:00:00", "2024-09-01 23:59:59"),
    bounding_box=(-96.0, 32.0, -89.0, 44.0),
)
results = earthaccess.search_data(**tempo_query)

# Download the data for every search result into ./tempo_data.
# NOTE(review): `files` is not referenced by any later cell visible here.
files = earthaccess.download(results, local_path="./tempo_data")

QUEUEING TASKS | :   0%|          | 0/18 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/18 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/18 [00:00<?, ?it/s]

In [20]:
# Read the tabular dataset from the notebook's working directory.
df = pd.read_csv("Nasadata.csv")

In [21]:
# Preview the first three rows to check that the file parsed as expected.
df.head(n=3)

Unnamed: 0,City,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,Dallas,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,Moderate
1,Vienna,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,Moderate
2,Singapore,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,Moderate


In [22]:
# Summarize column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   City                           5000 non-null   object 
 1   Temperature                    5000 non-null   float64
 2   Humidity                       5000 non-null   float64
 3   PM2.5                          5000 non-null   float64
 4   PM10                           5000 non-null   float64
 5   NO2                            5000 non-null   float64
 6   SO2                            5000 non-null   float64
 7   CO                             5000 non-null   float64
 8   Proximity_to_Industrial_Areas  5000 non-null   float64
 9   Population_Density             5000 non-null   int64  
 10  Air Quality                    5000 non-null   object 
dtypes: float64(8), int64(1), object(2)
memory usage: 429.8+ KB


In [23]:
# Give the two awkward column names (a space, a dot) identifier-friendly names.
column_renames = {
    'Air Quality': 'AirQuality',
    'PM2.5': 'PM2_5',
}
df = df.rename(columns=column_renames)

In [24]:
# Re-check the schema to confirm the renamed columns (AirQuality, PM2_5).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   City                           5000 non-null   object 
 1   Temperature                    5000 non-null   float64
 2   Humidity                       5000 non-null   float64
 3   PM2_5                          5000 non-null   float64
 4   PM10                           5000 non-null   float64
 5   NO2                            5000 non-null   float64
 6   SO2                            5000 non-null   float64
 7   CO                             5000 non-null   float64
 8   Proximity_to_Industrial_Areas  5000 non-null   float64
 9   Population_Density             5000 non-null   int64  
 10  AirQuality                     5000 non-null   object 
dtypes: float64(8), int64(1), object(2)
memory usage: 429.8+ KB


In [25]:
# Count missing values per column (isna is the same method as isnull).
df.isna().sum()

Unnamed: 0,0
City,0
Temperature,0
Humidity,0
PM2_5,0
PM10,0
NO2,0
SO2,0
CO,0
Proximity_to_Industrial_Areas,0
Population_Density,0


In [26]:
# Remove the City column; after this only the numeric columns and the
# AirQuality label remain in the frame.
# errors="ignore" keeps the cell idempotent: re-running it after City has
# already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns="City", errors="ignore")

In [27]:
# Separate the feature matrix (every column except the label) from the label.
x = df.drop(columns='AirQuality')
y = df.loc[:, 'AirQuality']

In [28]:
# Display a one-hot encoded view of the AirQuality column (first category dropped).
# NOTE(review): the returned frame is not assigned to anything, so `df`, `x`
# and `y` are unchanged by this cell; the split and model further down use the
# original string labels in `y`. Confirm whether this cell is only a preview.
pd.get_dummies(df,columns=['AirQuality'],drop_first=True)

Unnamed: 0,Temperature,Humidity,PM2_5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,AirQuality_Hazardous,AirQuality_Moderate,AirQuality_Poor
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,False,True,False
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,False,True,False
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,False,True,False
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,False,False,False
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,40.6,74.1,116.0,126.7,45.5,25.7,2.11,2.8,765,True,False,False
4996,28.1,96.9,6.9,25.0,25.3,10.8,1.54,5.7,709,False,True,False
4997,25.9,78.2,14.2,22.1,34.8,7.8,1.63,9.6,379,False,True,False
4998,25.3,44.4,21.4,29.0,23.7,5.7,0.89,11.6,241,False,False,False


In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so the test data does not influence them.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [31]:
# Random-forest classifier trained on the scaled training features.
# random_state is fixed (same seed as the train/test split) so that a fresh
# Restart & Run All rebuilds the same forest and reports the same score;
# without it every run trains a different model.
rf = RandomForestClassifier(random_state=42)
# fit() returns the estimator itself, so `model` and `rf` are the same fitted object.
model = rf.fit(x_train, y_train)

In [32]:
# Score the fitted classifier on the held-out split
# (for scikit-learn classifiers, score() reports mean accuracy).
model.score(x_test, y_test)

0.959

In [33]:
import pickle

In [34]:
# Bundle the fitted classifier with the fitted scaler and pickle both into one file.
artifacts = {'model': rf, 'scaler': scaler}
with open('model.pkl', 'wb') as f:
    pickle.dump(artifacts, f)

In [35]:
# Write the current contents of x_train to testdata.csv without an index column.
# NOTE(review): the file is named "testdata" but is built from x_train, which at
# this point holds the scaler's output rather than the original frame — so the
# values are scaled and the CSV header is presumably positional (0, 1, ...)
# instead of the feature names. Confirm whether x_test and/or the original
# column names were intended.
pd.DataFrame(x_train).to_csv('testdata.csv',index =False)