<a href="https://colab.research.google.com/github/Kabshah/smithub/blob/main/Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("/content/weather_forecast.csv")
df.shape

(1000, 8)

In [3]:
df.head(1)

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Pressure,Visibility,Day_type,Forecast
0,Rainy,Hot,Normal,Strong,Medium,Good,Weekday,Yes


In [4]:
df.describe(include='all')

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Pressure,Visibility,Day_type,Forecast
count,949,944,956,939,965,953,940,989
unique,14,15,10,10,15,15,10,5
top,Sunny,Hot,Normal,Strong,Low,Moderate,Weekday,Yes
freq,308,297,451,449,332,333,446,597


**2. Data Cleaning**


# Missing/Null Values

In [5]:
df.isnull().sum()

Unnamed: 0,0
Outlook,51
Temperature,56
Humidity,44
Wind,61
Pressure,35
Visibility,47
Day_type,60
Forecast,11


# Data Consistency

In [6]:
data_types = df.dtypes
data_types

Unnamed: 0,0
Outlook,object
Temperature,object
Humidity,object
Wind,object
Pressure,object
Visibility,object
Day_type,object
Forecast,object


In [7]:

# Check for duplicates
duplicates = df.duplicated()
print("\nNumber of Duplicate Rows:", df[duplicates].shape[0])
print(df[duplicates])

df = df.drop_duplicates()


Number of Duplicate Rows: 83
      Outlook Temperature Humidity    Wind Pressure Visibility Day_type  \
62   Overcast        Cool     High    Weak      Low       Poor  Weekend   
72      Sunny        Mild     High  Strong     High   Moderate  Weekday   
95      Rainy         Hot     High    Weak     High   Moderate  Weekend   
122  Overcast        Mild     High  Strong   Medium       Poor  Weekday   
148  Overcast        Cool     High  Strong     High       Poor  Weekend   
..        ...         ...      ...     ...      ...        ...      ...   
973     Rainy        Cool     High    Weak      Low       Poor  Weekend   
976     Rainy        Mild     High    Weak      Low   Moderate  Weekday   
980     Sunny        Mild     High    Weak      Low       Poor  Weekday   
993     Rainy         Hot     High  Strong   Medium   Moderate  Weekday   
995  Overcast         Hot     High    Weak     High       Good  Weekend   

    Forecast  
62       Yes  
72        No  
95     maybe  
122      

In [8]:
# Fill categorical columns with mode (most common value)
categorical_cols = ['Outlook', 'Temperature', 'Humidity', 'Wind', 'Pressure', 'Visibility', 'Day_type', 'Forecast']

for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [9]:
print(df.isnull().sum())
print(df.head())


Outlook        0
Temperature    0
Humidity       0
Wind           0
Pressure       0
Visibility     0
Day_type       0
Forecast       0
dtype: int64
    Outlook Temperature Humidity    Wind Pressure Visibility  Day_type  \
0     Rainy         Hot   Normal  Strong   Medium       Good  Weekday    
1     Rainy         Hot     High    Weak   Medium       Poor   Weekend   
2  Overcast        Mild     High  Strong      Low       Good   Weekend   
3     Rainy         Hot   Normal    Weak     High       Poor   Weekend   
4  overcast        Mild   Normal  Strong   Medium   Moderate   Weekday   

  Forecast  
0      Yes  
1      Yes  
2      Yes  
3      yes  
4      Yes  


In [10]:
# Make all text lowercase for consistency
for col in categorical_cols:
    df[col] = df[col].str.lower()

In [11]:
print(df.head())

    Outlook Temperature Humidity    Wind Pressure Visibility  Day_type  \
0     rainy         hot   normal  strong   medium       good  weekday    
1     rainy         hot     high    weak   medium       poor   weekend   
2  overcast        mild     high  strong      low       good   weekend   
3     rainy         hot   normal    weak     high       poor   weekend   
4  overcast        mild   normal  strong   medium   moderate   weekday   

  Forecast  
0      yes  
1      yes  
2      yes  
3      yes  
4      yes  


# Ordinal mapping for ordered columns

In [12]:
# Ordinal columns mapping
temperature_map = {'cool': 0, 'mild': 1, 'hot': 2}
humidity_map = {'low': 0, 'normal': 1, 'high': 2}
pressure_map = {'low': 0, 'medium': 1, 'high': 2}
visibility_map = {'poor': 0, 'moderate': 1, 'good': 2}

df['Temperature'] = df['Temperature'].map(temperature_map)
df['Humidity'] = df['Humidity'].map(humidity_map)
df['Pressure'] = df['Pressure'].map(pressure_map)
df['Visibility'] = df['Visibility'].map(visibility_map)

# Label encoding for nominal columns

In [13]:
from sklearn.preprocessing import LabelEncoder

label_cols = ['Outlook', 'Wind', 'Day_type', 'Forecast']

le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])


In [14]:
print(df.head())
print(df.dtypes)


   Outlook  Temperature  Humidity  Wind  Pressure  Visibility  Day_type  \
0        4          2.0       1.0     0       1.0         2.0         1   
1        4          2.0       2.0     4       1.0         0.0         4   
2        0          1.0       2.0     0       0.0         2.0         4   
3        4          2.0       1.0     4       2.0         0.0         4   
4        0          1.0       1.0     0       1.0         1.0         0   

   Forecast  
0         2  
1         2  
2         2  
3         2  
4         2  
Outlook          int64
Temperature    float64
Humidity       float64
Wind             int64
Pressure       float64
Visibility     float64
Day_type         int64
Forecast         int64
dtype: object


In [19]:
from sklearn.model_selection import train_test_split

X = df.drop('Forecast', axis=1)  # Features
y = df['Forecast']               # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7165109034267912
Confusion Matrix:
 [[  0   0   2]
 [  2  68  50]
 [  0  37 162]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.65      0.57      0.60       120
           2       0.76      0.81      0.78       199

    accuracy                           0.72       321
   macro avg       0.47      0.46      0.46       321
weighted avg       0.71      0.72      0.71       321



Question#01
Build a weather prediction model using the dataset (weather_forecast.csv) based on meteorological
conditions such as Outlook, Temperature, Humidity, Wind, Pressure, and Visibility. Train and evaluate the
model to determine which Supervised Learning Algorithm performs better in forecasting favorable
weather conditions.

a. Use the given dataset and perform its cleaning to handle missing values, inconsistent
capitalization, and noisy entries.

b. Apply Supervised Learning Algorithm and calculate the accuracy of predicting the Forecast
attribute.

c. Construct a model to classify weather conditions.

d. Identify which attributes (e.g., Outlook, Humidity, or Pressure) have the most influence on
the forecast.
Note: (Attached Screenshot of your solution)

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
df = pd.read_csv("/content/weather_forecast.csv")
categorical_cols = ['Outlook','Temperature','Humidity','Wind','Pressure',
                    'Visibility','Day_type','Forecast']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])
for col in categorical_cols:
    df[col] = df[col].astype(str).str.lower()
df['Forecast'] = df['Forecast'].replace({
    'y':'yes', 'yeah':'yes', 'yess':'yes', 'yeh':'yes', 'yas':'yes',
    'noo':'no', 'n':'no', 'nope':'no', 'nah':'no',
    'maybe':'no'})
df = df[df['Forecast'].isin(['yes','no'])]
df['Outlook'] = df['Outlook'].replace({
    'sun':'sunny', 'sunnyy':'sunny', 'sunlight':'sunny',
    'rain':'rainy', 'rainyy':'rainy', 'drizzle':'rainy',
    'cloudy':'overcast', 'cloudyy':'overcast'})
df['Visibility'] = df['Visibility'].replace({
    'very poor':'poor','mist':'poor','foggy':'poor',
    'clear':'good','excellent':'good'})
temperature_map = {'cool': 0, 'mild': 1, 'hot': 2}
humidity_map = {'low': 0, 'normal': 1, 'high': 2}
pressure_map  = {'low': 0, 'medium': 1, 'high': 2}
visibility_map = {'poor': 0, 'moderate': 1, 'good': 2}
df['Temperature'] = df['Temperature'].map(temperature_map)
df['Humidity']    = df['Humidity'].map(humidity_map)
df['Pressure']    = df['Pressure'].map(pressure_map)
df['Visibility']  = df['Visibility'].map(visibility_map)
label_cols = ['Outlook','Wind','Day_type']

le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])
df['Forecast'] = df['Forecast'].map({'no':0, 'yes':1})
X = df.drop('Forecast', axis=1)
y = df['Forecast']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy:", f"{accuracy_score(y_test, y_pred)*100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 80.00%

Confusion Matrix:
 [[ 76  37]
 [ 23 164]]
