In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw rainfall table from the CSV that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer='chennai-monthly-rains.csv')

In [3]:
# The first column of the file is treated as the year identifier;
# every later cell addresses the remaining columns as df.columns[1:].
column_names = df.columns
year_column = column_names[0]

In [4]:
# Store the year column as an integer dtype (exact width is platform-dependent).
df = df.astype({year_column: int})

In [5]:
# Best-effort numeric coercion of every non-year column: a column that
# cannot be parsed as float is reported and left with its original dtype.
for col in df.columns[1:]:
    try:
        as_float = df[col].astype(float)
    except ValueError:
        print(f"Could not convert column '{col}' to float. Keeping original data type.")
    else:
        df[col] = as_float

In [6]:
# Treat exact zeros in every non-year column as missing values.
# NOTE(review): this also covers the last column (Total), and a zero may be a
# genuine dry month rather than a missing reading - confirm this is intended.
value_columns = df.columns[1:]
df[value_columns] = df[value_columns].replace(0, np.nan)

In [7]:
def remove_outliers(df, column, whisker=1.5):
    """Clip one column to its interquartile-range fences and return a new frame.

    Despite the name, no rows are removed: values below
    ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR`` are replaced by
    the nearest fence. Missing values are skipped when the quartiles are
    computed and remain missing in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``. It is not modified.
    column : hashable
        Label of the numeric column to clip.
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the lower and upper fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``column`` has been clipped to the fences.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    # Work on a copy so the caller's frame is never altered as a side effect;
    # callers that rebind the return value (df = remove_outliers(df, c)) are
    # unaffected.
    clipped = df.copy()
    clipped[column] = values.clip(lower_bound, upper_bound)
    return clipped

In [8]:
# Clip every non-year column to its own IQR fences, one column at a time.
# NOTE(review): the last column (Total) is clipped independently of the
# monthly columns - confirm that is the intended treatment.
columns_to_clip = list(df.columns[1:])
for col in columns_to_clip:
    df = remove_outliers(df, col)

In [9]:
# Freeze the cleaned data under its own name so later cells inspect a stable copy.
clean_df = df.copy()

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that emits a stray "None" line after the report).
clean_df.info()
print(clean_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    121 non-null    int32  
 1   Jan     104 non-null    float64
 2   Feb     59 non-null     float64
 3   Mar     47 non-null     float64
 4   April   76 non-null     float64
 5   May     95 non-null     float64
 6   June    121 non-null    float64
 7   July    121 non-null    float64
 8   Aug     121 non-null    float64
 9   Sept    121 non-null    float64
 10  Oct     121 non-null    float64
 11  Nov     121 non-null    float64
 12  Dec     120 non-null    float64
 13  Total   121 non-null    float64
dtypes: float64(13), int32(1)
memory usage: 12.9 KB
None
   Year        Jan        Feb        Mar      April        May       June  \
0  1901   8.730337  37.404305   0.122438        NaN   0.482331   3.672060   
1  1902   6.882661   0.198963        NaN   0.216180   2.029171  14.067780   
2  1903  51.799

In [10]:
# Per-column summary statistics of the cleaned frame.
summary_stats = clean_df.describe()
print(summary_stats)

              Year         Jan        Feb        Mar      April        May  \
count   121.000000  104.000000  59.000000  47.000000  76.000000  95.000000   
mean   1961.000000   27.588738  11.640470  14.177944  17.270551  30.544245   
std      35.073732   34.385170  13.470721  15.041464  17.981561  33.643919   
min    1901.000000    0.101310   0.114316   0.122438   0.105535   0.138579   
25%    1931.000000    2.654986   1.509111   2.296068   2.566272   5.701253   
50%    1961.000000   11.619703   6.589488   8.538346  10.697957  14.150794   
75%    1991.000000   43.698339  15.867189  20.560482  23.370464  40.645628   
max    2021.000000  105.263369  37.404305  47.957102  54.576751  93.062190   

             June        July         Aug        Sept         Oct         Nov  \
count  121.000000  121.000000  121.000000  121.000000  121.000000  121.000000   
mean    49.061729   85.550406  116.745290  118.943768  264.302811  345.058421   
std     32.394237   47.069174   62.791945   63.301708 

In [11]:
# Count the missing entries in each column of the cleaned frame and print
# them under a heading.
missing_per_column = clean_df.isna().sum()
print("\nMissing values:", missing_per_column, sep="\n")


Missing values:
Year      0
Jan      17
Feb      62
Mar      74
April    45
May      26
June      0
July      0
Aug       0
Sept      0
Oct       0
Nov       0
Dec       1
Total     0
dtype: int64


In [12]:
# Show the cleaned frame as the cell's output value for a final visual check.
clean_df

Unnamed: 0,Year,Jan,Feb,Mar,April,May,June,July,Aug,Sept,Oct,Nov,Dec,Total
0,1901,8.730337,37.404305,0.122438,,0.482331,3.672060,140.931463,191.312288,67.905936,220.755039,311.003928,327.474789,1352.832018
1,1902,6.882661,0.198963,,0.216180,2.029171,14.067780,64.906015,116.023665,116.667332,538.493570,328.385771,187.440381,1375.311487
2,1903,51.799512,17.790745,,,93.062190,28.247849,84.063122,124.490061,228.610041,273.410423,454.908194,366.402779,1741.141398
3,1904,64.476110,,,0.105535,26.258777,26.954648,92.565785,33.578225,96.097558,67.904656,9.128419,62.965906,480.035620
4,1905,39.389811,5.518977,13.637900,20.251322,0.229572,49.718699,47.089597,87.600900,68.931937,535.766217,231.564292,16.087658,1115.786881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,2017,3.412927,,,,14.091811,68.550561,110.649314,274.929851,121.996193,330.933017,538.198694,61.700571,1571.591651
117,2018,0.541746,,2.730167,,,27.999960,49.585058,136.170961,26.489109,171.823354,167.701012,23.988978,607.030347
118,2019,,1.052644,,0.829021,,18.961959,108.368240,170.484346,207.486163,369.642465,97.741268,208.671916,1183.238022
119,2020,44.878906,0.210529,,16.308935,,24.151420,87.049739,75.437341,162.249864,253.001334,438.801018,208.760138,1310.849223


In [13]:
# Export the snapshot that the preceding cells inspected (clean_df) rather
# than the working frame, so the written file always matches what was reviewed.
# index=False keeps the positional row index out of the CSV.
clean_df.to_csv('rainfall_processed_output.csv', index=False)