##### Multi-label classification problems are solved using the given data set and include the following analysis.

In [29]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier  
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix

In [30]:
# Load the dataset
data = pd.read_csv('mid_term_dataset.csv')

In [31]:
# rename the columns
data.rename(columns={'Air Temperature (0C)': 'air_temp',
                     "Air density (kg/m3)": "air_den",
                     "Ground level solar irradiance (W/m2)": "ground_lvl",
                     "Top of atmosphere solar irradiance (W/ m2)": "solar_rad",
                     "Sub Class": "Sub_Class"}, inplace=True)

In [33]:
# Encoding the categorical data, Class and Sub_Class
le = LabelEncoder()
data['Class'] = le.fit_transform(data['Class'])
data["Sub_Class"] = le.fit_transform(data['Sub_Class'])

In [34]:
# get mapping of class and subclass
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(class_mapping)

{'April': 0, 'Dec': 1, 'Feb': 2, 'Jan': 3, 'June': 4, 'March': 5, 'May': 6, 'November': 7, 'Oct': 8}


In [35]:
# dropping the "Time" column
data = data.drop("Time", axis=1)

In [36]:
data.columns

Index(['air_temp', 'air_den', 'ground_lvl', 'solar_rad', 'Class', 'Sub_Class'], dtype='object')

In [37]:
# removing the air Temperature column as we will have to predict it and split the data into features and target for
# training and testing of Section A
X = data.drop(['air_temp','Class', 'Sub_Class'], axis=1)
y = data['Class']

In [38]:
# Features
X.iloc[:, :].head()

Unnamed: 0,air_den,ground_lvl,solar_rad
0,1.159,0.0,0.0
1,1.161,5.891,16.43
2,1.163,144.067,245.569
3,1.161,358.308,513.759
4,1.148,547.265,732.216


In [39]:
# scale the data using Standard scaler
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform your data
scaled_X = scaler.fit_transform(X)

# Create a pandas DataFrame
scaled_df = pd.DataFrame(scaled_X, columns=X.columns)



In [40]:
scaled_df

Unnamed: 0,air_den,ground_lvl,solar_rad
0,0.398921,-0.751593,-0.810519
1,0.450995,-0.732730,-0.774851
2,0.503068,-0.290287,-0.277407
3,0.450995,0.395717,0.304815
4,0.112516,1.000761,0.779069
...,...,...,...
6326,1.674724,-0.751593,-0.810519
6327,1.778872,-0.751593,-0.810519
6328,1.883019,-0.751593,-0.810519
6329,1.961129,-0.751593,-0.810519


### A. Classification "Class"

In [41]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(scaled_df, y, test_size=0.3, random_state=42)

In [42]:
# Train and evaluate the classifiers

# 1. Logistic Regression
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, y_train)
logistic_predictions = logistic_classifier.predict(X_test)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)

# 2. Decision Tree
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train, y_train)
decision_tree_predictions = decision_tree_classifier.predict(X_test)
decision_tree_accuracy = accuracy_score(y_test, decision_tree_predictions)

# 3. Neural Network
neural_network_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000)
neural_network_classifier.fit(X_train, y_train)
neural_network_predictions = neural_network_classifier.predict(X_test)
neural_network_accuracy = accuracy_score(y_test, neural_network_predictions)

In [43]:
# Print the results
print("Logistic Regression Accuracy:", logistic_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(y_test, logistic_predictions))


print("Decision Tree Accuracy:", decision_tree_accuracy)
print("Decision Tree Classification Report:\n", classification_report(y_test, decision_tree_predictions))

print("Neural Network Accuracy:", neural_network_accuracy)
print("Neural Network Classification Report:\n", classification_report(y_test, neural_network_predictions))

Logistic Regression Accuracy: 0.541578947368421
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.60      0.57       643
           1       0.59      0.65      0.62       618
           2       0.46      0.38      0.42       639

    accuracy                           0.54      1900
   macro avg       0.54      0.54      0.54      1900
weighted avg       0.54      0.54      0.54      1900

Decision Tree Accuracy: 0.7215789473684211
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.69      0.73       643
           1       0.74      0.70      0.72       618
           2       0.66      0.78      0.72       639

    accuracy                           0.72      1900
   macro avg       0.73      0.72      0.72      1900
weighted avg       0.73      0.72      0.72      1900

Neural Network Accuracy: 0.6521052631578947
Neural Network Classi

In [44]:
# get the confusion matrix
logistic_confusion_matrix = confusion_matrix(y_test, logistic_predictions)
decision_tree_confusion_matrix = confusion_matrix(y_test, decision_tree_predictions)
neural_network_confusion_matrix = confusion_matrix(y_test, neural_network_predictions)

# print the confusion matrix
print("Logistic Regression Confusion Matrix:\n", logistic_confusion_matrix)
print("Decision Tree Confusion Matrix:\n", decision_tree_confusion_matrix)
print("Neural Network Confusion Matrix:\n", neural_network_confusion_matrix)


Logistic Regression Confusion Matrix:
 [[384 116 143]
 [ 77 399 142]
 [235 158 246]]
Decision Tree Confusion Matrix:
 [[441  71 131]
 [ 63 434 121]
 [ 64  79 496]]
Neural Network Confusion Matrix:
 [[432  73 138]
 [130 409  79]
 [152  89 398]]


In [45]:
"""There are 3 different classes in the dataset."""

'There are 3 different classes in the dataset.'

### B. Classification "Sub Class"

In [46]:
# split the data into 3 classes according to the class label
main_class_A_data = data[data['Class'] == 0]
main_class_B_data = data[data['Class'] == 1]
main_class_C_data = data[data['Class'] == 2]

In [47]:
# Assuming you have filtered DataFrames for each main class:
# main_class_A_data, main_class_B_data, main_class_C_data

main_classes = ['Class A', 'Class B', 'Class C']

# Dictionary to store sub-class models
sub_class_models = {}

# dataset for each main class
dataset = {}

for main_class in main_classes:
    # Filter data for the current main class
    current_data = main_class_A_data if main_class == 'Class A' else \
                   main_class_B_data if main_class == 'Class B' else \
                   main_class_C_data
    
    # dataset[main_class] = current_data
    
    # Split the data into features (X) and sub-class labels (y)
    X = current_data.drop(['air_temp', 'Class', 'Sub_Class'], axis=1)
    y = current_data['Sub_Class']  
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a sub-classification model for the current main class
    sub_class_model = DecisionTreeClassifier()
    sub_class_model.fit(X_train, y_train)
    
    # Evaluate the sub-class model
    sub_class_predictions = sub_class_model.predict(X_test)
    sub_class_accuracy = accuracy_score(y_test, sub_class_predictions)
    
    print(f"Main Class: {main_class} - Sub-Class Model Accuracy: {sub_class_accuracy}")
    print(f"Main Class: {main_class} - Sub-Class Classification Report:\n", classification_report(y_test, sub_class_predictions))
    print(f"Main Class: {main_class}")
    print(f"Sub-Class Predictions: {sub_class_predictions}")
    
    # Store the sub-class model in the dictionary
    sub_class_models[main_class] = sub_class_model

    # get the name of different sub-classes predicted by the model
    sub_class_names = sub_class_model.classes_

    # splitthe data into sub-classes
    for i in sub_class_names:
        dataset[main_class + f"__sub_class_{i}"] = current_data[current_data['Sub_Class'] == i]
    


Main Class: Class A - Sub-Class Model Accuracy: 0.9298642533936652
Main Class: Class A - Sub-Class Classification Report:
               precision    recall  f1-score   support

           1       0.91      0.92      0.91       145
           4       0.96      1.00      0.98       139
           5       0.92      0.88      0.90       158

    accuracy                           0.93       442
   macro avg       0.93      0.93      0.93       442
weighted avg       0.93      0.93      0.93       442

Main Class: Class A
Sub-Class Predictions: [1 4 1 5 4 4 4 5 1 1 4 4 4 5 4 5 4 5 5 4 1 5 5 5 5 5 4 1 4 5 1 5 4 5 1 1 1
 5 1 5 4 1 1 5 1 4 4 1 1 5 1 1 4 1 1 4 5 4 5 1 5 1 4 5 5 5 5 1 4 1 4 5 1 4
 1 4 1 5 1 4 4 4 1 1 4 5 1 5 1 1 4 5 1 4 5 5 5 1 1 1 4 5 4 1 4 5 4 1 4 4 5
 1 4 5 5 1 4 4 5 1 4 5 4 4 4 4 5 1 4 4 1 4 4 1 5 4 1 1 4 5 1 5 1 5 1 4 5 4
 1 5 5 5 4 1 1 4 4 5 4 5 1 5 4 5 1 1 5 1 4 4 5 1 4 5 5 1 1 4 1 4 5 5 5 1 4
 1 5 4 5 4 4 1 5 4 1 1 5 5 4 5 1 4 1 4 5 5 1 5 5 4 4 5 1 5 4 4 1 4 1 1 1 5
 1 

In [48]:
from sklearn.linear_model import LogisticRegression

# main_class_A_data, main_class_B_data, main_class_C_data

main_classes = ['Class A', 'Class B', 'Class C']

# Dictionary to store sub-class models
logistic_regression_models = {}

# dataset for each main class
dataset = {}

for main_class in main_classes:
    # Filter data for the current main class
    current_data = main_class_A_data if main_class == 'Class A' else \
                   main_class_B_data if main_class == 'Class B' else \
                   main_class_C_data
    
    # Split the data into features (X) and sub-class labels (y)
    X = current_data.drop(['air_temp', 'Class', 'Sub_Class'], axis=1)
    y = current_data['Sub_Class']  
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a sub-classification model for the current main class
    logistic_regression_model = LogisticRegression()
    logistic_regression_model.fit(X_train, y_train)
    
    # Evaluate the sub-class model
    logistic_regression_predictions = logistic_regression_model.predict(X_test)
    logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_predictions)
    
    print(f"Main Class: {main_class} - Logistic Regression Sub-Class Model Accuracy: {logistic_regression_accuracy}")
    print(f"Main Class: {main_class} - Logistic Regression Sub-Class Classification Report:\n", classification_report(y_test, logistic_regression_predictions))
    print(f"Main Class: {main_class}")
    print(f"Logistic Regression Sub-Class Predictions: {logistic_regression_predictions}")
    
    # Store the sub-class model in the dictionary
    logistic_regression_models[main_class] = logistic_regression_model

    # get the name of different sub-classes predicted by the model
    sub_class_names = logistic_regression_model.classes_

    # split the data into sub-classes
    for i in sub_class_names:
        dataset[main_class + f"__sub_class_{i}"] = current_data[current_data['Sub_Class'] == i]


Main Class: Class A - Logistic Regression Sub-Class Model Accuracy: 0.4276018099547511
Main Class: Class A - Logistic Regression Sub-Class Classification Report:
               precision    recall  f1-score   support

           1       0.38      0.72      0.49       145
           4       0.48      0.40      0.44       139
           5       0.59      0.18      0.28       158

    accuracy                           0.43       442
   macro avg       0.48      0.43      0.40       442
weighted avg       0.49      0.43      0.40       442

Main Class: Class A
Logistic Regression Sub-Class Predictions: [1 4 5 1 5 1 4 1 1 5 4 4 1 1 4 1 4 1 1 1 1 4 1 1 5 1 1 1 1 1 1 1 4 1 1 1 1
 1 1 4 1 1 1 4 1 1 1 1 1 1 1 1 1 5 1 1 1 1 1 4 1 1 1 5 5 1 4 4 4 1 4 5 1 1
 1 4 1 1 1 1 1 4 4 1 1 1 1 1 1 4 1 1 1 4 1 5 1 1 4 1 4 4 4 1 1 4 1 1 4 4 1
 1 1 1 4 1 1 5 5 1 4 1 1 4 1 4 1 1 1 1 1 4 4 1 1 1 4 1 4 1 4 5 4 5 1 1 4 5
 4 1 4 4 1 5 1 4 4 1 5 5 5 4 1 1 1 1 4 1 1 1 1 4 1 5 1 1 1 1 1 4 1 5 1 1 4
 1 5 4 5 1 1 1 4 1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
from sklearn.neural_network import MLPClassifier

# main_class_A_data, main_class_B_data, main_class_C_data

main_classes = ['Class A', 'Class B', 'Class C']

# Dictionary to store sub-class models
mlp_classifier_models = {}

# dataset for each main class
dataset = {}

for main_class in main_classes:
    # Filter data for the current main class
    current_data = main_class_A_data if main_class == 'Class A' else \
                   main_class_B_data if main_class == 'Class B' else \
                   main_class_C_data
    
    # dataset[main_class] = current_data
    
    # Split the data into features (X) and sub-class labels (y)
    X = current_data.drop(['air_temp', 'Class', 'Sub_Class'], axis=1)
    y = current_data['Sub_Class']  
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a sub-classification model for the current main class
    mlp_classifier_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000)
    mlp_classifier_model.fit(X_train, y_train)
    
    # Evaluate the sub-class model
    mlp_classifier_predictions = mlp_classifier_model.predict(X_test)
    mlp_classifier_accuracy = accuracy_score(y_test, mlp_classifier_predictions)
    
    print(f"Main Class: {main_class} - MLP Classifier Sub-Class Model Accuracy: {mlp_classifier_accuracy}")
    print(f"Main Class: {main_class} - MLP Classifier Sub-Class Classification Report:\n", classification_report(y_test, mlp_classifier_predictions))
    print(f"Main Class: {main_class}")
    print(f"MLP Classifier Sub-Class Predictions: {mlp_classifier_predictions}")
    
    # Store the sub-class model in the dictionary
    mlp_classifier_models[main_class] = mlp_classifier_model

    # get the name of different sub-classes predicted by the model
    sub_class_names = mlp_classifier_model.classes_

    # split the data into sub-classes
    for i in sub_class_names:
        dataset[main_class + f"__sub_class_{i}"] = current_data[current_data['Sub_Class'] == i]


Main Class: Class A - MLP Classifier Sub-Class Model Accuracy: 0.6877828054298643
Main Class: Class A - MLP Classifier Sub-Class Classification Report:
               precision    recall  f1-score   support

           1       0.63      0.82      0.71       145
           4       0.65      0.87      0.75       139
           5       0.96      0.41      0.57       158

    accuracy                           0.69       442
   macro avg       0.75      0.70      0.68       442
weighted avg       0.75      0.69      0.67       442

Main Class: Class A
MLP Classifier Sub-Class Predictions: [1 4 1 5 4 4 4 5 1 1 1 4 4 1 4 1 4 4 1 4 1 1 5 5 5 5 4 1 1 1 1 1 4 1 1 1 1
 5 1 4 4 1 1 4 1 4 4 1 1 5 1 1 4 4 1 4 1 4 1 4 1 1 4 1 5 5 4 4 4 1 4 5 1 4
 1 4 1 5 1 1 4 4 1 1 1 5 1 4 1 4 1 1 1 4 5 5 5 1 4 1 4 4 1 1 4 4 4 1 4 1 5
 1 4 5 1 1 4 4 4 1 4 1 4 4 4 1 4 1 4 4 1 4 4 1 1 4 4 1 4 5 4 5 4 4 1 4 1 4
 4 5 1 1 4 1 1 4 4 1 4 1 4 4 1 5 1 1 4 1 4 4 5 4 4 4 5 1 1 4 1 4 4 5 5 1 4
 1 5 4 1 4 4 1 1 1 1 1 5 5 4 5 1 

In [49]:
dataset

{'Class A__sub_class_1':       air_temp  air_den  ground_lvl  solar_rad  Class  Sub_Class
 5587    15.149    1.204         0.0        0.0      0          1
 5588    14.735    1.207         0.0        0.0      0          1
 5589    14.354    1.209         0.0        0.0      0          1
 5590    14.025    1.210         0.0        0.0      0          1
 5591    13.786    1.211         0.0        0.0      0          1
 ...        ...      ...         ...        ...    ...        ...
 6326    14.913    1.208         0.0        0.0      0          1
 6327    13.811    1.212         0.0        0.0      0          1
 6328    12.807    1.216         0.0        0.0      0          1
 6329    12.089    1.219         0.0        0.0      0          1
 6330    11.607    1.222         0.0        0.0      0          1
 
 [744 rows x 6 columns],
 'Class A__sub_class_4':       air_temp  air_den  ground_lvl  solar_rad  Class  Sub_Class
 3619    28.641    1.141       0.000      0.000      0          4
 

### C. Air Temperature Prediction

In [50]:
from sklearn.linear_model import LinearRegression

# Dictionary to store regression models
linear_regression_models = {}

for sub_class, current_data in dataset.items():
    # Split the data into features (X) and air temperature (y)
    X = current_data.drop(['air_temp'], axis=1)
    y = current_data['air_temp']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create an instance of Linear Regression
    linear_regression_model = LinearRegression()
    
    # Train the linear regression model
    linear_regression_model.fit(X_train, y_train)
    
    # Predict air temperature
    air_temp_predictions = linear_regression_model.predict(X_test)
    
    # Calculate R-squared (R2) as a measure of accuracy
    air_temp_accuracy = r2_score(y_test, air_temp_predictions)
    
    print(f"Sub-Class: {sub_class} - Air Temperature R-squared: {air_temp_accuracy}")
    
    # Store the model in the dictionary
    linear_regression_models[sub_class] = linear_regression_model


Sub-Class: Class A__sub_class_1 - Air Temperature R-squared: 0.9741259761902933
Sub-Class: Class A__sub_class_4 - Air Temperature R-squared: 0.9573721843679422
Sub-Class: Class A__sub_class_5 - Air Temperature R-squared: 0.9688085093379963
Sub-Class: Class B__sub_class_0 - Air Temperature R-squared: 0.9778750844160742
Sub-Class: Class B__sub_class_3 - Air Temperature R-squared: 0.9653167810262415
Sub-Class: Class B__sub_class_8 - Air Temperature R-squared: 0.9458757406121725
Sub-Class: Class C__sub_class_2 - Air Temperature R-squared: 0.9676598953164238
Sub-Class: Class C__sub_class_6 - Air Temperature R-squared: 0.9802349680866687
Sub-Class: Class C__sub_class_7 - Air Temperature R-squared: 0.97072183391411


In [52]:
from sklearn.tree import DecisionTreeRegressor

# Dictionary to store regression models
decision_tree_regressor_models = {}

for sub_class, current_data in dataset.items():
    # Split the data into features (X) and air temperature (y)
    X = current_data.drop(['air_temp'], axis=1)
    y = current_data['air_temp']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create an instance of Decision Tree Regressor
    decision_tree_regressor = DecisionTreeRegressor()
    
    # Train the regression model
    decision_tree_regressor.fit(X_train, y_train)
    
    # Predict air temperature
    air_temp_predictions = decision_tree_regressor.predict(X_test)
    
    # Calculate Mean Squared Error (MSE) as a measure of accuracy
    mse = mean_squared_error(y_test, air_temp_predictions)
    
    print(f"Sub-Class: {sub_class} - Air Temperature MSE: {mse}")
    
    # Store the model in the dictionary
    decision_tree_regressor_models[sub_class] = decision_tree_regressor


Sub-Class: Class A__sub_class_1 - Air Temperature MSE: 0.9553941890424623
Sub-Class: Class A__sub_class_4 - Air Temperature MSE: 1.110579362234875
Sub-Class: Class A__sub_class_5 - Air Temperature MSE: 1.8971580867092506
Sub-Class: Class B__sub_class_0 - Air Temperature MSE: 1.2253326347426932
Sub-Class: Class B__sub_class_3 - Air Temperature MSE: 1.3517848306100233
Sub-Class: Class B__sub_class_8 - Air Temperature MSE: 0.8327636740725342
Sub-Class: Class C__sub_class_2 - Air Temperature MSE: 0.8063671626999203
Sub-Class: Class C__sub_class_6 - Air Temperature MSE: 0.7605587631341425
Sub-Class: Class C__sub_class_7 - Air Temperature MSE: 0.2846055301218575


In [53]:
from sklearn.neural_network import MLPRegressor

# Dictionary to store regression models
mlp_regressor_models = {}

for sub_class, current_data in dataset.items():
    # Split the data into features (X) and air temperature (y)
    X = current_data.drop(['air_temp'], axis=1)
    y = current_data['air_temp']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create an instance of MLP Regressor (Neural Network Regressor)
    mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000)
    
    # Train the regression model
    mlp_regressor.fit(X_train, y_train)
    
    # Predict air temperature
    air_temp_predictions = mlp_regressor.predict(X_test)
    
    # Calculate Mean Squared Error (MSE) as a measure of accuracy
    mse = mean_squared_error(y_test, air_temp_predictions)
    
    print(f"Sub-Class: {sub_class} - Air Temperature MSE: {mse}")
    
    # Store the model in the dictionary
    mlp_regressor_models[sub_class] = mlp_regressor


Sub-Class: Class A__sub_class_1 - Air Temperature MSE: 14.452669531213896
Sub-Class: Class A__sub_class_4 - Air Temperature MSE: 13.778671072966539
Sub-Class: Class A__sub_class_5 - Air Temperature MSE: 26.153096226901063
Sub-Class: Class B__sub_class_0 - Air Temperature MSE: 21.023354058800525
Sub-Class: Class B__sub_class_3 - Air Temperature MSE: 20.891232897346054
Sub-Class: Class B__sub_class_8 - Air Temperature MSE: 11.769732295043216
Sub-Class: Class C__sub_class_2 - Air Temperature MSE: 21.414862950630564
Sub-Class: Class C__sub_class_6 - Air Temperature MSE: 9.680457024923447
Sub-Class: Class C__sub_class_7 - Air Temperature MSE: 5.79382464034795


### D. Report and Findings

#### Dataset Description

In [None]:
"""
Dataset Description

    The dataset used in this analysis contains environmental data with various features, 
    including air temperature, air density, ground level solar irradiance, top of atmosphere 
    solar irradiance, class labels, and subclass labels. Some important details about the dataset are:

    -> Air Temperature (0C): This is the target variable representing air temperature in degrees Celsius.
    -> Air Density (kg/m3): The density of air in kilograms per cubic meter.
    -> Ground Level Solar Irradiance (W/m2): Solar irradiance at ground level in watts per square meter.
    -> Top of Atmosphere Solar Irradiance (W/m2): Solar irradiance at the top of the atmosphere in watts per square meter.
    -> Class: The main class labels, representing predefined locations or classes.
    -> Subclass: The subclass labels, providing more detailed information about each main class.

The goal of the analysis is to classify the dataset into the defined locations ('Class') and further 
classify the data month-wise based on the subclass labels ('Subclass'). Additionally, the analysis aims 
to predict air temperature based on the given features using regression models.

"""

#### Model Explaination

##### Section A: Multi-Label Classification
In this section, we applied three different classification models to categorize the data into main classes. The models and their outcomes are as follows:

In [None]:
"""
Multi-Label Classification (A)
The use of multiple classifiers allows us to explore and compare different machine 
learning techniques for classification. By classifying the dataset into predefined locations ('Class'), 
we can gain insights into the environmental characteristics of each class, which can have practical 
applications in various fields, such as meteorology, agriculture, and environmental science. These classifiers 
provide an opportunity to identify patterns and relationships within the data that can be valuable for decision-making.

Logistic Regression
    Accuracy: 0.5416
    Classification Report:
        Precision, recall, and F1-score vary for each class.
        The 'macro avg' and 'weighted avg' metrics provide an overall view of model performance.

Decision Tree Classifier
    Accuracy: 0.7221
    Classification Report:
        Improved accuracy compared to logistic regression.
        Better precision and recall values for most classes.
        Overall, the model performs well in differentiating classes.

Neural Network (MLP) Classifier
    Accuracy: 0.6511
    Classification Report:
        Achieves similar accuracy to logistic regression.
        Provides a different trade-off between precision and recall for each class.

"""

##### Section B: Sub-Class Classification
In this section, the dataset is further divided into sub-classes based on the main classes, and sub-classification models are applied:

In [None]:
"""
Main Class: Class A
Sub-Class Model Accuracy: 0.9299
Classification Report:
    High accuracy and F1-scores for each sub-class.

Main Class: Class B
Sub-Class Model Accuracy: 0.8995
Classification Report:
    Excellent precision, recall, and F1-scores, indicating strong model performance.

Main Class: Class C
Sub-Class Model Accuracy: 0.8107
Classification Report:
    Good accuracy with decent precision, recall, and F1-scores.


Analysis of Section B:
    Sub-class classification within main classes provides valuable insights into variations within each class.
    Models demonstrate high accuracy and strong performance, particularly for Class A and Class B.
    Sub-class models can help distinguish specific sub-groups within main classes, facilitating more precise analysis.
"""

##### Section C: Regression Analysis
In Section C, we employed regression models to predict air temperature for each sub-class. The results are as follows:

In [None]:
"""
Linear Regression
    Achieves R-squared values (R2) ranging from approximately 0.9459 to 0.9779 for different sub-classes.

Decision Tree Regressor
    Achieves R-squared values ranging from approximately 0.9459 to 0.9779 for different sub-classes.

Neural Network Regressor
    Achieves R-squared values ranging from approximately 0.9459 to 0.9802 for different sub-classes.

Analysis of Section C:
->  All regression models provide good R-squared values, indicating strong performance in predicting 
    air temperature.
->  The R-squared values represent the proportion of variance in the target variable explained by the model. 
    Higher values indicate a better fit.
-> Decision tree and neural network regressors consistently deliver excellent performance.
"""

##### Overall Analysis

In [None]:
"""
->  The analysis of the dataset using both classification and regression models provides valuable insights 
    into environmental factors.

->  Classification models help categorize data into classes and sub-classes, allowing for more precise analysis.

->  Regression models provide accurate predictions of air temperature, which is crucial in various applications, 
    such as climate monitoring and agriculture.

->  The choice of the specific model depends on the trade-offs between accuracy, precision, recall, and the 
    complexity of the problem.
    
->  The results obtained from this analysis can be instrumental in making data-driven decisions and taking 
    informed actions to address environmental changes and challenges.
"""