<a href="https://colab.research.google.com/github/Mithraeye/-Data-Analyst-Project-Series.-/blob/main/Weather_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Direct-download link (Google Drive) for the CSV this notebook analyses.
url = "https://drive.google.com/uc?id=1e0XOalmmdQgU6Kx7iTqyzGR7grSbFD0n"

# Read the remote CSV straight into a DataFrame; every later cell works on it.
weather_data = pd.read_csv(url)

# Preview the top rows (head() defaults to five) to see the columns and
# the kind of values each one holds.
first_rows = weather_data.head()
print(first_rows)


       Date/Time  Temp_C  Dew Point Temp_C  Rel Hum_%  Wind Speed_km/h  \
0  1/1/2012 0:00    -1.8              -3.9         86                4   
1  1/1/2012 1:00    -1.8              -3.7         87                4   
2  1/1/2012 2:00    -1.8              -3.4         89                7   
3  1/1/2012 3:00    -1.5              -3.2         88                6   
4  1/1/2012 4:00    -1.5              -3.3         88                7   

   Visibility_km  Press_kPa               Weather  
0            8.0     101.24                   Fog  
1            8.0     101.24                   Fog  
2            4.0     101.26  Freezing Drizzle,Fog  
3            4.0     101.27  Freezing Drizzle,Fog  
4            4.8     101.23                   Fog  


In [None]:
# Check for missing values in the dataset
missing_values = weather_data.isnull().sum()
print("Missing Values:")
print(missing_values)

# Handle missing values
# Numeric columns: fill each gap with that column's median.
numeric_columns = weather_data.select_dtypes(include=['int64', 'float64']).columns
weather_data[numeric_columns] = weather_data[numeric_columns].fillna(weather_data[numeric_columns].median())

# Categorical (object) columns: fill each gap with that column's most
# frequent value. The first row of mode() holds one mode per column.
categorical_columns = weather_data.select_dtypes(include=['object']).columns
categorical_modes = weather_data[categorical_columns].mode()
# mode() has no rows when there are no object columns, or when those columns
# contain no non-null values at all; `.iloc[0]` would raise IndexError then,
# and there is nothing to fill with anyway, so skip the step.
if not categorical_modes.empty:
    weather_data[categorical_columns] = weather_data[categorical_columns].fillna(categorical_modes.iloc[0])

# Recheck for missing values to confirm they are handled
missing_values_after_handling = weather_data.isnull().sum()
print("\nMissing Values After Handling:")
print(missing_values_after_handling)

Missing Values:
Date/Time           0
Temp_C              0
Dew Point Temp_C    0
Rel Hum_%           0
Wind Speed_km/h     0
Visibility_km       0
Press_kPa           0
Weather             0
dtype: int64

Missing Values After Handling:
Date/Time           0
Temp_C              0
Dew Point Temp_C    0
Rel Hum_%           0
Wind Speed_km/h     0
Visibility_km       0
Press_kPa           0
Weather             0
dtype: int64


In [None]:
# Define a function to detect outliers using IQR method
def detect_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR``
    or strictly above ``Q3 + factor * IQR``, where ``Q1``/``Q3`` are the
    25th/75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : label
        Name of the column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels preserved) whose
        ``column`` value falls outside the fences. Empty when nothing does.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Strict comparisons: values sitting exactly on a fence are kept.
    return df[(values < lower_bound) | (values > upper_bound)]

# Detect outliers in numeric columns
# Every column is checked against the same, still-unfiltered frame, so the
# rows flagged for one column do not depend on what another column flagged.
numeric_columns = weather_data.select_dtypes(include=['int64', 'float64']).columns
outliers_dict = {column: detect_outliers(weather_data, column) for column in numeric_columns}

# Remove outliers
# Report each column's outliers and collect the index labels of every row
# flagged by at least one column.
outlier_labels = set()
for column, outliers in outliers_dict.items():
    if not outliers.empty:
        print("Outliers detected in column:", column)
        print(outliers)
        outlier_labels.update(outliers.index)

# Drop all flagged rows in one pass. The explicit .copy() makes the result an
# independent frame rather than a slice of the original, so later in-place
# column assignments on it do not raise SettingWithCopyWarning.
weather_data = weather_data[~weather_data.index.isin(outlier_labels)].copy()

print("\nData after removing outliers:")
print(weather_data.head())


Outliers detected in column: Rel Hum_%
            Date/Time  Temp_C  Dew Point Temp_C  Rel Hum_%  Wind Speed_km/h  \
1984  3/23/2012 16:00    15.5              -9.0         18               22   
3066   5/7/2012 18:00    20.7              -4.4         18               13   

      Visibility_km  Press_kPa        Weather  
1984           48.3      101.6         Cloudy  
3066           48.3      101.0  Mostly Cloudy  
Outliers detected in column: Wind Speed_km/h
             Date/Time  Temp_C  Dew Point Temp_C  Rel Hum_%  Wind Speed_km/h  \
25       1/2/2012 1:00     4.6               0.0         72               39   
32       1/2/2012 8:00     1.9              -3.3         68               39   
33       1/2/2012 9:00     1.8              -3.7         67               44   
34      1/2/2012 10:00     1.5              -4.1         66               43   
36      1/2/2012 12:00     1.7              -6.2         56               48   
...                ...     ...               ...      

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create a Min-Max scaler with its default settings
scaler = MinMaxScaler()

# Fit the scaler on the numeric columns, then overwrite those columns
# with the rescaled values it returns
scaled_values = scaler.fit_transform(weather_data[numeric_columns])
weather_data[numeric_columns] = scaled_values

# Show the top rows of the frame after normalization
print("\nNormalized Data:")
normalized_preview = weather_data.head()
print(normalized_preview)



Normalized Data:
         Date/Time    Temp_C  Dew Point Temp_C  Rel Hum_%  Wind Speed_km/h  \
20  1/1/2012 20:00  0.470693          0.563327   0.846154         0.542857   
21  1/1/2012 21:00  0.484902          0.570888   0.820513         0.571429   
23  1/1/2012 23:00  0.507993          0.576560   0.743590         0.857143   
24   1/2/2012 0:00  0.506217          0.567108   0.717949         1.000000   
26   1/2/2012 2:00  0.483126          0.521739   0.641026         0.914286   

    Visibility_km  Press_kPa        Weather  
20            1.0   0.110294         Cloudy  
21            1.0   0.083333         Cloudy  
23            1.0   0.063725         Cloudy  
24            1.0   0.051471   Rain Showers  
26            1.0   0.051471  Mostly Cloudy  
