In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [1]:
from ucimlrepo import fetch_ucirepo

# Download the UCI dataset with repository id 89.
solar_flare = fetch_ucirepo(id=89)

# Unpack the feature table and the target table (both pandas DataFrames).
dataset_tables = solar_flare.data
X, y = dataset_tables.features, dataset_tables.targets

# Print the dataset-level metadata first, then the per-variable description.
for description in (solar_flare.metadata, solar_flare.variables):
    print(description)


{'uci_id': 89, 'name': 'Solar Flare', 'repository_url': 'https://archive.ics.uci.edu/dataset/89/solar+flare', 'data_url': 'https://archive.ics.uci.edu/static/public/89/data.csv', 'abstract': 'Each class attribute counts the number of solar flares of a certain class that occur in a 24 hour period', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 1389, 'num_features': 10, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['common flares', 'moderate flares', 'severe flares'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1989, 'last_updated': 'Wed Feb 14 2024', 'dataset_doi': '10.24432/C5530G', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': 'Notes:\r\n\r\n   -- The database contains 3 potential classes, one for the number of times a certain type of solar flare occured in a 24 hour period.\r\n   -- Each instance represents captur

In [2]:
# Display the feature table as loaded.
X

Unnamed: 0,modified Zurich class,largest spot size,spot distribution,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot
0,C,S,O,1,2,1,1,2,1,2
1,D,S,O,1,3,1,1,2,1,2
2,C,S,O,1,3,1,1,2,1,1
3,D,S,O,1,3,1,1,2,1,2
4,D,A,O,1,3,1,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...
1384,H,S,X,1,2,1,1,1,1,1
1385,H,S,X,2,2,1,1,2,1,1
1386,C,S,O,1,2,1,2,2,1,1
1387,H,R,X,1,2,1,1,2,1,1


In [3]:
# Display the target table (one column per flare class).
y

Unnamed: 0,common flares,moderate flares,severe flares
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
1384,0,0,0
1385,0,0,0
1386,0,0,0
1387,0,0,0


In [11]:
# Boolean mask of the feature table: True wherever a value is missing.
X.isna()

Unnamed: 0,modified Zurich class,largest spot size,spot distribution,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
1384,False,False,False,False,False,False,False,False,False,False
1385,False,False,False,False,False,False,False,False,False,False
1386,False,False,False,False,False,False,False,False,False,False
1387,False,False,False,False,False,False,False,False,False,False


In [12]:
# Missing-value count for each feature column.
X.isna().sum()

modified Zurich class              0
largest spot size                  0
spot distribution                  0
activity                           0
evolution                          0
previous 24 hour flare activity    0
historically-complex               0
became complex on this pass        0
area                               0
area of largest spot               0
dtype: int64

In [14]:
# Total number of missing values across every target column.
y.isna().sum().sum()

np.int64(0)

In [15]:
# Summarize the feature table: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1389 entries, 0 to 1388
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   modified Zurich class            1389 non-null   object
 1   largest spot size                1389 non-null   object
 2   spot distribution                1389 non-null   object
 3   activity                         1389 non-null   int64 
 4   evolution                        1389 non-null   int64 
 5   previous 24 hour flare activity  1389 non-null   int64 
 6   historically-complex             1389 non-null   int64 
 7   became complex on this pass      1389 non-null   int64 
 8   area                             1389 non-null   int64 
 9   area of largest spot             1389 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 108.6+ KB


In [17]:
from sklearn.preprocessing import LabelEncoder
# List the feature column names.
X.columns

Index(['modified Zurich class', 'largest spot size', 'spot distribution',
       'activity', 'evolution', 'previous 24 hour flare activity',
       'historically-complex', 'became complex on this pass', 'area',
       'area of largest spot'],
      dtype='object')

In [22]:
# Encode categorical features in X using LabelEncoder
#
# Work on a copy: `X` was bound directly to `solar_flare.data.features`, so
# assigning columns in place would also alter the object held by the dataset
# (and may trigger pandas' SettingWithCopyWarning if that frame is a slice).
X = X.copy()

# Keep one fitted encoder per column, keyed by column name. Reusing a single
# encoder across columns would overwrite its fitted classes on every
# iteration, leaving only the last column's mapping recoverable.
# Use `label_encoders[col].classes_` / `.inverse_transform(...)` to map the
# integer codes back to the original category labels.
label_encoders = {}
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [23]:
# Display the feature table again to inspect the label-encoded columns.
X

Unnamed: 0,modified Zurich class,largest spot size,spot distribution,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot
0,1,4,2,1,2,1,1,2,1,2
1,2,4,2,1,3,1,1,2,1,2
2,1,4,2,1,3,1,1,2,1,1
3,2,4,2,1,3,1,1,2,1,2
4,2,0,2,1,3,1,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...
1384,5,4,3,1,2,1,1,1,1,1
1385,5,4,3,2,2,1,1,2,1,1
1386,1,4,2,1,2,1,2,2,1,1
1387,5,3,3,1,2,1,1,2,1,1


In [24]:
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [25]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, x_test, y_train, y_test = split_parts

# Depth-limited regression tree fitted on the training portion.
# `fit` returns the estimator itself, so `model` is the fitted tree.
model = DecisionTreeRegressor(max_depth=4, random_state=42).fit(X_train, y_train)
model

In [26]:
# Predict on the held-out rows with the fitted tree.
y_pred = model.predict(x_test)

# Overall score: with several target columns, scikit-learn's default is the
# uniform average of the per-target mean squared errors.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean squared Error: {mse:.3f}")

# The target columns differ a lot in scale (see `y.describe()`), so the
# averaged figure is dominated by the most frequent flare class. Report each
# target's own MSE as well.
per_target_mse = mean_squared_error(y_test, y_pred, multioutput="raw_values")
for target_name, target_mse in zip(y_test.columns, per_target_mse):
    print(f"  {target_name}: {target_mse:.3f}")

Mean squered Error: 0.296


In [29]:
# Summarize the target table: column names, non-null counts and dtypes.
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1389 entries, 0 to 1388
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   common flares    1389 non-null   int64
 1   moderate flares  1389 non-null   int64
 2   severe flares    1389 non-null   int64
dtypes: int64(3)
memory usage: 32.7 KB


In [30]:
# Re-check the feature table's dtypes after label encoding.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1389 entries, 0 to 1388
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   modified Zurich class            1389 non-null   int64
 1   largest spot size                1389 non-null   int64
 2   spot distribution                1389 non-null   int64
 3   activity                         1389 non-null   int64
 4   evolution                        1389 non-null   int64
 5   previous 24 hour flare activity  1389 non-null   int64
 6   historically-complex             1389 non-null   int64
 7   became complex on this pass      1389 non-null   int64
 8   area                             1389 non-null   int64
 9   area of largest spot             1389 non-null   int64
dtypes: int64(10)
memory usage: 108.6 KB


In [31]:
# Summary statistics (count, mean, std, quartiles, min/max) per feature column.
X.describe()

Unnamed: 0,modified Zurich class,largest spot size,spot distribution,activity,evolution,previous 24 hour flare activity,historically-complex,became complex on this pass,area,area of largest spot
count,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0,1389.0
mean,2.50036,2.951764,1.982721,1.150468,2.421166,1.089993,1.395968,1.892009,1.025918,1.175666
std,1.834517,1.716543,0.815725,0.357658,0.617129,0.403292,0.489234,0.310481,0.158948,0.380673
min,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0
50%,2.0,4.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0
75%,5.0,4.0,3.0,1.0,3.0,1.0,2.0,2.0,1.0,1.0
max,5.0,5.0,3.0,2.0,3.0,3.0,2.0,2.0,2.0,2.0


In [32]:
# Summary statistics (count, mean, std, quartiles, min/max) per target column.
y.describe()

Unnamed: 0,common flares,moderate flares,severe flares
count,1389.0,1389.0,1389.0
mean,0.261339,0.067675,0.009359
std,0.760201,0.353695,0.103534
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,8.0,5.0,2.0
