In [34]:
import numpy as np
from numpy import isnan
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.figure_factory as ff
import plotly.express as px

In [35]:
# Load the dataset and discard its first column, keeping every row.
df = pd.read_csv("Autism-Adult-Data1.csv").iloc[:, 1:]

KNN IMPUTER

In [36]:
# Feature frame: every column of df except the 'Class/ASD' target.
X = df.drop(columns=['Class/ASD'])
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609 entries, 0 to 608
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   id               609 non-null    int64
 1   A1_Score         609 non-null    int64
 2   A2_Score         609 non-null    int64
 3   A3_Score         609 non-null    int64
 4   A4_Score         609 non-null    int64
 5   A5_Score         609 non-null    int64
 6   A6_Score         609 non-null    int64
 7   A7_Score         609 non-null    int64
 8   A8_Score         609 non-null    int64
 9   A9_Score         609 non-null    int64
 10  A10_Score        609 non-null    int64
 11  age              609 non-null    int64
 12  gender           609 non-null    int64
 13  ethnicity        609 non-null    int64
 14  jundice          609 non-null    int64
 15  austim           609 non-null    int64
 16  contry_of_res    609 non-null    int64
 17  used_app_before  609 non-null    int64
 18  result    

In [37]:
# Target vector: the 'Class/ASD' column of df.
y = df['Class/ASD']

In [38]:
# Show the feature frame.
X

Unnamed: 0,id,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,...,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation
0,1,1,1,1,1,0,0,1,1,0,...,9,0,9,0,0,57,0,6,0,4
1,2,1,1,0,1,0,0,0,1,0,...,7,1,3,0,1,11,0,5,0,4
2,3,1,1,0,1,1,0,1,1,1,...,10,1,3,1,1,49,0,8,0,2
3,4,1,1,0,1,0,0,1,1,0,...,18,0,9,0,1,57,0,6,0,4
4,6,1,1,1,1,1,0,1,1,1,...,19,1,5,1,0,57,0,9,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,699,1,1,1,1,1,1,1,1,1,...,10,0,6,0,0,5,0,10,0,4
605,700,0,1,0,1,1,0,1,1,1,...,8,0,9,0,0,44,0,7,0,4
606,701,1,0,0,0,0,0,0,1,0,...,17,1,2,0,0,33,0,3,0,2
607,703,1,0,0,1,1,0,1,0,1,...,18,1,7,0,0,40,0,6,0,4


In [39]:
# Show the target series.
y

0      0
1      0
2      1
3      0
4      1
      ..
604    1
605    1
606    0
607    0
608    1
Name: Class/ASD, Length: 609, dtype: int64

In [40]:
def knn_imputer(X, k=5):
    """Fill NaN entries with the mean of the k nearest rows' observed values.

    Neighbours are ranked with a NaN-aware Euclidean distance: only features
    observed in BOTH rows contribute, and the squared sum is rescaled by
    ``n_features / n_shared`` so rows sharing few features are not favoured.
    Distances and donor values are always taken from the original data, so a
    value imputed earlier never influences a later imputation.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric data; missing entries are marked with NaN.
    k : int, default 5
        Number of nearest donor rows averaged for each missing entry.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as ``X``. When ``X`` has no missing
        value the array is returned as-is (dtype preserved). An entry stays
        NaN only if its whole column is unobserved.
    """
    # Explicit copy: the caller's frame is never modified and the array is
    # guaranteed to be writable.
    Xtrans = X.to_numpy(copy=True)
    missing = np.isnan(Xtrans)
    if not missing.any():
        return Xtrans

    observed = Xtrans.copy()  # snapshot of the data before any imputation
    present = ~missing
    n_features = Xtrans.shape[1]

    for i in np.where(missing.any(axis=1))[0]:
        # Features that are observed both in row i and in each candidate row.
        shared = present & present[i]
        n_shared = shared.sum(axis=1)
        sq_diff = np.where(shared, observed - observed[i], 0.0) ** 2
        with np.errstate(divide="ignore", invalid="ignore"):
            distances = np.sqrt(sq_diff.sum(axis=1) * (n_features / n_shared))
        # Rows with no feature in common cannot be compared with row i.
        distances[n_shared == 0] = np.inf

        for idx in np.where(missing[i])[0]:
            donors = np.where(present[:, idx])[0]
            if donors.size == 0:
                # The column has no observed value at all; nothing to borrow.
                continue
            donor_dist = distances[donors]
            comparable = np.isfinite(donor_dist)
            if comparable.any():
                # Stable sort keeps ties in row order, so results are deterministic.
                ranked = np.argsort(donor_dist[comparable], kind="stable")[:k]
                nearest = donors[comparable][ranked]
            else:
                # No comparable donor: fall back to the mean of every donor.
                nearest = donors
            Xtrans[i, idx] = observed[nearest, idx].mean()

    return Xtrans

In [41]:
# Impute with the default k=5; knn_imputer returns a NumPy array, not a DataFrame.
Xtrans = knn_imputer(X)

In [42]:
# Show the imputed array.
Xtrans

array([[  1,   1,   1, ...,   6,   0,   4],
       [  2,   1,   1, ...,   5,   0,   4],
       [  3,   1,   1, ...,   8,   0,   2],
       ...,
       [701,   1,   0, ...,   3,   0,   2],
       [703,   1,   0, ...,   6,   0,   4],
       [704,   1,   0, ...,   8,   0,   4]])

In [43]:
# Count the NaN entries remaining after imputation.
print('Missing: %d' % np.isnan(Xtrans).sum())

Missing: 0


MODEL BUILDING

In [44]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly partition X and y into aligned train and test subsets.

    Row positions are shuffled with NumPy's global random generator; the first
    ``int(n_rows * test_size)`` shuffled positions become the test set and the
    rest the training set. Rows are selected with ``.iloc``, so X and y must
    be pandas objects.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Targets, positionally aligned with X.
    test_size : float, default 0.2
        Fraction of rows assigned to the test set (rounded down).
    random_state : int or None, default None
        When given, the global NumPy generator is re-seeded with it first.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Seeding is optional; without it the split differs from call to call.
    if random_state is not None:
        np.random.seed(random_state)

    total = X.shape[0]
    holdout = int(total * test_size)

    order = np.arange(total)
    np.random.shuffle(order)
    test_rows, train_rows = order[:holdout], order[holdout:]

    return (
        X.iloc[train_rows],
        X.iloc[test_rows],
        y.iloc[train_rows],
        y.iloc[test_rows],
    )

# Give both the features and the target a fresh 0..n-1 index (in place).
for frame in (X, y):
    frame.reset_index(drop=True, inplace=True)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [45]:
# Column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 488 entries, 165 to 559
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   id               488 non-null    int64
 1   A1_Score         488 non-null    int64
 2   A2_Score         488 non-null    int64
 3   A3_Score         488 non-null    int64
 4   A4_Score         488 non-null    int64
 5   A5_Score         488 non-null    int64
 6   A6_Score         488 non-null    int64
 7   A7_Score         488 non-null    int64
 8   A8_Score         488 non-null    int64
 9   A9_Score         488 non-null    int64
 10  A10_Score        488 non-null    int64
 11  age              488 non-null    int64
 12  gender           488 non-null    int64
 13  ethnicity        488 non-null    int64
 14  jundice          488 non-null    int64
 15  austim           488 non-null    int64
 16  contry_of_res    488 non-null    int64
 17  used_app_before  488 non-null    int64
 18  result       

In [46]:
# Dtype and non-null count of the training targets.
y_train.info()

<class 'pandas.core.series.Series'>
Index: 488 entries, 165 to 559
Series name: Class/ASD
Non-Null Count  Dtype
--------------  -----
488 non-null    int64
dtypes: int64(1)
memory usage: 7.6 KB


In [47]:
# Column dtypes and non-null counts of the test features.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121 entries, 576 to 278
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   id               121 non-null    int64
 1   A1_Score         121 non-null    int64
 2   A2_Score         121 non-null    int64
 3   A3_Score         121 non-null    int64
 4   A4_Score         121 non-null    int64
 5   A5_Score         121 non-null    int64
 6   A6_Score         121 non-null    int64
 7   A7_Score         121 non-null    int64
 8   A8_Score         121 non-null    int64
 9   A9_Score         121 non-null    int64
 10  A10_Score        121 non-null    int64
 11  age              121 non-null    int64
 12  gender           121 non-null    int64
 13  ethnicity        121 non-null    int64
 14  jundice          121 non-null    int64
 15  austim           121 non-null    int64
 16  contry_of_res    121 non-null    int64
 17  used_app_before  121 non-null    int64
 18  result       

In [48]:
# Dtype and non-null count of the test targets.
y_test.info()

<class 'pandas.core.series.Series'>
Index: 121 entries, 576 to 278
Series name: Class/ASD
Non-Null Count  Dtype
--------------  -----
121 non-null    int64
dtypes: int64(1)
memory usage: 1.9 KB


In [49]:
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    The model is ``sigmoid(X @ theta)``. No intercept term is added, so
    callers who need a bias must append a constant column to X themselves.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size of each gradient-descent update.
    num_iterations : int, default 1000
        Number of full-batch updates performed by ``fit``.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def _sigmoid(self, z):
        """Return the logistic function of z, element-wise."""
        # exp() overflows float64 for arguments above ~709; clipping at +/-500
        # avoids that while leaving results unchanged for every |z| <= 500.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def _compute_cost(self, y, y_pred):
        """Return the mean binary cross-entropy between y and y_pred."""
        m = len(y)
        # Keep probabilities strictly inside (0, 1): log(0) is -inf and
        # 0 * -inf is NaN, which would otherwise poison the whole sum.
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)
        cost = -1 / m * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
        return cost

    def fit(self, X, y):
        """Learn ``theta`` from features X (m, n) and 0/1 labels y (m,).

        Starts from all-zero weights and applies ``num_iterations`` updates.
        Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If X and y do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y so a column vector cannot broadcast into an (m, m) matrix.
        y = np.asarray(y, dtype=float).ravel()
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have different numbers of samples: %d != %d"
                % (X.shape[0], y.shape[0])
            )

        self.theta = np.zeros(X.shape[1])
        m = len(y)

        for _ in range(self.num_iterations):
            z = np.dot(X, self.theta)
            y_pred = self._sigmoid(z)

            # Gradient of the mean cross-entropy with respect to theta.
            gradient = np.dot(X.T, (y_pred - y)) / m
            self.theta -= self.learning_rate * gradient

        return self

    def predict(self, X):
        """Return 0.0/1.0 class labels for the rows of X.

        Probabilities are rounded with ``np.round``, so a probability of
        exactly 0.5 maps to class 0.
        """
        z = np.dot(np.asarray(X, dtype=float), self.theta)
        y_pred = self._sigmoid(z)
        return np.round(y_pred)

# Example usage
# Build the NumPy inputs from the split in this cell, so it also runs after
# "Restart & Run All" instead of relying on arrays left over in the kernel.
X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()
y_train_np, y_test_np = y_train.to_numpy(), y_test.to_numpy()

# NOTE(review): the features are passed unscaled and include the `id` column;
# with learning_rate=0.01 gradient descent may not converge - confirm whether
# standardising the features first is intended.
LR = LogisticRegression()
LR.fit(X_train_np, y_train_np)
predictions = LR.predict(X_test_np)
accuracy = (predictions == y_test_np).mean() * 100
print("* Accuracy score for LR:", accuracy, "\n")


* Accuracy score for LR: 33.88429752066116 

