In [1]:
import pandas as pd
import torch 
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss

In [2]:
device = torch.device("cpu")

In [3]:
device

device(type='cpu')

In [4]:
data = pd.read_csv("Data/train.csv")

In [5]:
if "id" in data.columns and "day" in data.columns:
    
    data.drop(["id", "day"], axis = 1, inplace = True)


In [6]:
X = data.drop("rainfall", axis = 1)
y = data["rainfall"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42, shuffle = True)

# Standardised

In [8]:
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()

In [9]:
Std = StandardScaler()

X_std = Std.fit_transform(X_train)
X_t_std = Std.transform(X_test)



In [10]:
X_train = torch.tensor(X_std, device = device, dtype = torch.float32)
X_test = torch.tensor(X_t_std, device = device, dtype = torch.float32)


In [11]:
X_train = X_train.T
y_train = y_train.values.reshape(1,-1)

X_test = X_test.T
y_test = y_test.values.reshape(1, -1)

In [12]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")



X_train shape: torch.Size([10, 1752])
y_train shape: (1, 1752)


In [13]:
y_train = torch.tensor(y_train, device = device)

# Weights Initialization

In [17]:
torch.manual_seed(1)

w1 = torch.randn([10, 10], dtype = torch.float32, device = device, requires_grad = True)
b1 = torch.ones([10,1], dtype = torch.float32, device = device, requires_grad = True)

w2 = torch.randn([1, 10], dtype = torch.float32, device = device, requires_grad = True)
b2 = torch.ones([1,1], dtype = torch.float32, device = device, requires_grad = True)


lr = 0.01

for i in range(10000):

    Z1 = torch.matmul(w1, X_train) + b1
    A1 = torch.relu(Z1)

    Z2 = torch.matmul(w2, A1) + b2
    A2 = torch.sigmoid(Z2)


    loss = -torch.mean((y_train*torch.log(A2)) + ((1-y_train)*torch.log(1-A2)))

    loss.backward()

    with torch.no_grad():

        w1 -= lr * w1.grad
        b1 -= lr * b1.grad

        w2 -= lr * w2.grad
        b2 -= lr * b2.grad

    w1.grad.zero_()
    b1.grad.zero_()
    w2.grad.zero_()
    b2.grad.zero_()

    if i%1000 == 0:
        print(f"At Iteration {i}, Loss = {loss}")
    
                       

    
    


At Iteration 0, Loss = 3.4555530548095703
At Iteration 1000, Loss = 0.38670167326927185
At Iteration 2000, Loss = 0.349431574344635
At Iteration 3000, Loss = 0.3385985493659973
At Iteration 4000, Loss = 0.3337803781032562
At Iteration 5000, Loss = 0.33086687326431274
At Iteration 6000, Loss = 0.329024076461792
At Iteration 7000, Loss = 0.3277038037776947
At Iteration 8000, Loss = 0.32659053802490234
At Iteration 9000, Loss = 0.3256551921367645


In [18]:
loss

tensor(0.3248, grad_fn=<NegBackward0>)

# Predictions

In [20]:
X_test.shape

torch.Size([10, 438])

In [22]:
final_Z1 = torch.matmul(w1, X_test) + b1
final_A1 = torch.relu(final_Z1)

final_Z2 = torch.matmul(w2, final_A1) + b2
final_A2 = torch.sigmoid(final_Z2)

In [70]:
final_A2

tensor([[0.8762, 0.7924, 0.6497, 0.8562, 0.9304, 0.9726, 0.9746, 0.9569, 0.6626,
         0.9786, 0.9707, 0.9234, 0.8567, 0.1569, 0.1790, 0.7037, 0.9723, 0.8393,
         0.8108, 0.0775, 0.3654, 0.9935, 0.1729, 0.9085, 0.2419, 0.9879, 0.8847,
         0.8765, 0.8896, 0.9895, 0.9702, 0.9648, 0.8034, 0.1749, 0.0779, 0.9933,
         0.9516, 0.9800, 0.7020, 0.5414, 0.8768, 0.8083, 0.9731, 0.9513, 0.9822,
         0.9869, 0.9253, 0.9910, 0.4020, 0.9511, 0.9557, 0.3307, 0.9278, 0.9770,
         0.8231, 0.3392, 0.9776, 0.9780, 0.9763, 0.7981, 0.3760, 0.9616, 0.8689,
         0.9751, 0.9538, 0.8358, 0.8722, 0.9411, 0.9839, 0.7098, 0.9041, 0.9766,
         0.8821, 0.9615, 0.1135, 0.9489, 0.5465, 0.9860, 0.7438, 0.2140, 0.7708,
         0.9706, 0.4422, 0.9273, 0.9225, 0.9712, 0.9510, 0.7550, 0.9883, 0.7784,
         0.9006, 0.9114, 0.1877, 0.1273, 0.8201, 0.8254, 0.9705, 0.9308, 0.0800,
         0.9537, 0.9257, 0.2265, 0.3959, 0.9307, 0.1775, 0.8325, 0.9229, 0.9722,
         0.9274, 0.9827, 0.0

In [29]:
y_pred = (final_A2 >= 0.5).int()

In [34]:
y_test = torch.tensor(y_test, dtype = torch.float32)

In [41]:
(y_test == y_pred).float().mean().item()

0.8493150472640991

# Kaggle Submission

In [101]:
k_test = pd.read_csv("data/test.csv")

In [102]:
k_test.drop(["id", "day"], axis = 1, inplace = True)

In [103]:
k_test.isnull().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64

In [104]:
k_test["winddirection"].fillna(k_test["winddirection"].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  k_test["winddirection"].fillna(k_test["winddirection"].mode()[0], inplace=True)


In [105]:
k_test.isnull().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
dtype: int64

In [106]:
k_test_std = Std.transform(k_test)

In [107]:
k_test = torch.tensor(k_test_std, device = device, dtype = torch.float32)

In [108]:
k_test = k_test.T

## Predictions

In [109]:
test_Z1 = torch.matmul(w1, k_test) + b1
test_A1 = torch.relu(test_Z1)

test_Z2 = torch.matmul(w2, test_A1) + b2
test_A2 = torch.sigmoid(test_Z2)


In [110]:
y_test_pred = (test_A2 >= 0.5).int()

In [111]:
y_test_pred

tensor([[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
         1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
         1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
         1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
         1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
         1, 0, 1, 1, 1, 1, 1

In [112]:
y_test_prob = test_A2.squeeze().cpu().detach().numpy()  # Convert to NumPy for CSV


In [113]:
y_test_prob

array([9.78946924e-01, 9.66228664e-01, 9.25831139e-01, 1.18769847e-01,
       2.96779722e-01, 7.91357815e-01, 9.32086289e-01, 9.34217930e-01,
       9.30172086e-01, 6.94446206e-01, 9.74261642e-01, 9.25381109e-02,
       9.60261941e-01, 9.67502236e-01, 4.92137283e-01, 4.41239821e-03,
       8.77035916e-01, 7.32620180e-01, 9.94661376e-02, 4.24453057e-03,
       2.16954723e-01, 2.99595386e-01, 8.18579912e-01, 9.88265514e-01,
       9.61277008e-01, 3.88935596e-01, 1.44925024e-02, 9.87611115e-01,
       9.63293791e-01, 4.83363420e-01, 9.53551292e-01, 9.74236846e-01,
       8.96574080e-01, 9.27508533e-01, 9.38289165e-01, 9.35344160e-01,
       2.29264647e-01, 8.98386657e-01, 8.02524626e-01, 8.54342103e-01,
       9.05019641e-01, 9.53528106e-01, 1.30268887e-01, 9.17865753e-01,
       9.27458167e-01, 2.22244546e-01, 1.83589190e-01, 9.58257675e-01,
       2.85871387e-01, 8.39380383e-01, 9.31019247e-01, 9.75835979e-01,
       9.79727745e-01, 9.65153337e-01, 9.80147123e-01, 9.45229352e-01,
      

In [115]:
k_test = pd.read_csv("data/test.csv")

# Ensure y_test_prob is a NumPy array and has correct shape
y_test_prob = y_test_prob.flatten()  # Flatten to ensure it's 1D

# Create submission DataFrame
submission = pd.DataFrame({
    "id": k_test["id"],  # Use the row index as ID
    "rainfall": y_test_prob  # Use predicted probabilities
})

# Save the submission file
submission.to_csv("submission.csv", index=False)

print("✅ Submission file saved as 'submission.csv'. Ready for Kaggle upload! 🎯")

✅ Submission file saved as 'submission.csv'. Ready for Kaggle upload! 🎯


In [80]:
k_test.isnull().sum()

id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64

In [81]:
k_test.head(10)

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4
5,2195,6,1027.1,15.6,12.6,11.5,9.0,76.0,94.0,0.0,20.0,41.4
6,2196,7,1022.6,15.5,13.7,10.7,11.8,79.0,95.0,0.0,20.0,43.1
7,2197,8,1013.5,20.5,16.2,15.2,13.1,94.0,93.0,0.2,70.0,41.3
8,2198,9,1021.3,16.3,13.2,11.3,10.8,85.0,99.0,0.1,20.0,34.0
9,2199,10,1026.1,10.4,8.5,7.0,3.1,69.0,88.0,0.0,20.0,26.4


In [83]:
k_test["winddirection"].mode()[0]

np.float64(70.0)