In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn.functional as F

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler,FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import copy


In [2]:
import os


# Common  columns : Crop,Year

path_to_dataset = "Dataset/Task2Summary.csv"
df = pd.read_csv(path_to_dataset)

# print(df.describe())
df.drop(
    axis=1,
    columns=["districtcode", "statename", "statecode", "year", "crop",'districtname'],
    inplace=True,
)
#
# Define categorical and numerical features

numerical_features = [
    "area1000hectares",
    "production1000tonnes",
    "irrigatedarea1000hectares",
    "nitrogenconsumptiontonnes",
    "phosphateconsumptiontonnes",
    "potashconsumptiontonnes",
    "total_rainfall",
    "average_rainfall",
    "salinity_alkalinity_percent",
]

df["irrigatedarea1000hectares"] = df["irrigatedarea1000hectares"] + 1
# df["unirrigatedarea1000hecatres"] = (
#     df["area1000hectares"] - df["irrigatedarea1000hectares"] + 2
# )


df["salinity_alkalinity_percent"] = df["salinity_alkalinity_percent"] + 1

numerical_transformer = Pipeline(
    steps=[
        ("log", FunctionTransformer(np.log, validate=True))
    ]  # np.log1p to handle zeros safely
)


categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
    ],
    remainder="passthrough",  # Leave numerical features as they are
)


X = preprocessor.fit_transform(df)

# onehot_columns = []
# for feature, categories in zip(
#     categorical_features,
#     preprocessor.transformers_[0][1].named_steps["onehot"].categories_,
# ):
#     for category in categories:
#         onehot_columns.append(f"{feature}_{category}")

numerical_columns = numerical_features 

# Combine both lists
all_columns = numerical_columns

# Now, X is a sparse matrix, so we need to convert it to a dense format to access column names easily
X_dense = X

# Convert dense matrix to a DataFrame with appropriate column names
X_df = pd.DataFrame(X_dense, columns=all_columns)

output_file = "output.txt"
with open(output_file, "w") as f:
    for i in X_df.iloc[0].items():

        f.write(str(i))
        f.write("\n")


X = X_df.drop(axis=1, columns=["production1000tonnes"])
Y = X_df["production1000tonnes"]


# Target (Y)
# Y = df["production1000tonnes"].values.reshape(-1, 1)

# # Convert to torch tensors
# X_tensor = torch.tensor(X, dtype=torch.float32)
# Y_tensor = torch.tensor(Y, dtype=torch.float32)


X_copy,Y_Copy =  copy.deepcopy(X),copy.deepcopy(Y)
X,Y = torch.tensor(X_copy.to_numpy()),torch.tensor(Y_Copy.to_numpy())

In [3]:
class Project_Dataset(Dataset):

    def __init__(self):
        super().__init__()

        raise NotImplemented

    def __len__(self):

        raise NotImplemented

    def __getitem__(self, index):

        raise NotImplemented

    def return_full_batch(self):

        raise NotImplemented

In [9]:
class OLS(nn.Module):

    def __init__(self, num_features, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.num_features = num_features
        self.weights = nn.Parameter(
            torch.randn(num_features, 1)
        )  # shape (num_features, 1)
        self.bias = nn.Parameter(torch.randn(1, 1))

    def forward(self, X):

        return torch.matmul(X, self.weights) + self.bias
    
    
    def calculate_statistics(self,y_prime:torch.tensor,y:torch.tensor):
        SST = ((y-y.mean())**2).sum()
        
        SSR = ((y_prime-y.mean())**2).sum()
        
        SSE =      ((y_prime - y) ** 2).sum()
        
        EXPECTEDVALUE_U = (y-y_prime).sum() 
        
        
        print(SST,SSE,SSR,EXPECTEDVALUE_U)
        return SSR,SSE,SST
    

    def train(self, dataset):

        # Only accepts full batch

        X, Y = dataset

        X_augmented = torch.cat((torch.ones(X.shape[0], 1), X), dim=1)
        X_t = X_augmented.t()  # Transpose of X'
        w = torch.linalg.inv(X_t @ X_augmented) @ X_t @ Y
        print(w.dtype)
        self.weights = nn.Parameter(w[1:].reshape(-1, 1))
        self.bias = nn.Parameter(w[0].reshape(1, 1))
        
        
        self.calculate_statistics(self(X),Y)

In [8]:
model = OLS(9) 

model.train((X,Y))

torch.float64
tensor(623.2786, dtype=torch.float64) tensor(297935.5659, dtype=torch.float64, grad_fn=<SumBackward0>) tensor(545.0962, dtype=torch.float64, grad_fn=<SumBackward0>)


In [6]:
# This is for EDA

FILE_NAME = "dataset.csv"

df = pd.read_csv(FILE_NAME)
print("Head of Dataset:")
print(df.head())

print("Dataset Info:")
print(df.info())

print("Summary Statistics:")
print(df.describe())

print("Missing Values:")
print(df.isnull().sum())

print("Duplicate Rows:", df.duplicated().sum())

if "target" in df.columns:
    print("Target Distribution:")
    print(df["target"].value_counts())
    sns.countplot(data=df, x="target")
    plt.title("Target Class Distribution")
    plt.show()

plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

df.hist(figsize=(12, 8), bins=30)
plt.suptitle("Histogram of Features", fontsize=16)
plt.tight_layout()
plt.show()

numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
for col in numerical_cols:
    plt.figure(figsize=(6, 3))
    sns.boxplot(data=df, x=col)
    plt.title(f"Boxplot: {col}")
    plt.tight_layout()
    plt.show()


cat_cols = df.select_dtypes(include="object").columns.tolist()
for col in cat_cols:
    print(f"Value Counts for {col}:")
    print(df[col].value_counts())
    sns.countplot(data=df, x=col)
    plt.title(f"Distribution of {col}")
    plt.xticks(rotation=45)
    plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'