In [4]:
import pandas as pd

# Function to read and process the file
def process_data(file_path):
    data = []
    labels = []

    with open(file_path, 'r') as file:
        for line in file:
            # Split the first part (label) and the rest (features)
            parts = line.strip().split(' ', 1)
            labels.append(parts[0])
            features = parts[1].split()

            # Create a dictionary of feature index and values
            feature_dict = {int(f.split(':')[0]): float(f.split(':')[1]) for f in features}
            data.append(feature_dict)

    # Convert to DataFrame, fill missing columns with 0 if feature indexes are skipped
    df = pd.DataFrame(data).fillna(0)

    # Add labels as a new column
    df['Label'] = labels

    return df

# Usage
file_path = 'diabetes_scale.txt'  # Replace with your file path
df = process_data(file_path)

# Show the first few rows of the dataframe
df.head()


Unnamed: 0,1,2,3,4,5,6,7,8,Label
0,-0.294118,0.487437,0.180328,-0.292929,-1.0,0.00149,-0.53117,-0.033333,-1
1,-0.882353,-0.145729,0.081967,-0.414141,-1.0,-0.207153,-0.766866,-0.666667,1
2,-0.058824,0.839196,0.04918,-1.0,-1.0,-0.305514,-0.492741,-0.633333,-1
3,-0.882353,-0.105528,0.081967,-0.535354,-0.777778,-0.162444,-0.923997,-1.0,1
4,-1.0,0.376884,-0.344262,-0.292929,-0.602837,0.28465,0.887276,-0.6,-1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       768 non-null    float64
 1   2       768 non-null    float64
 2   3       768 non-null    float64
 3   4       768 non-null    float64
 4   5       768 non-null    float64
 5   6       768 non-null    float64
 6   7       768 non-null    float64
 7   8       768 non-null    float64
 8   Label   768 non-null    object 
dtypes: float64(8), object(1)
memory usage: 54.1+ KB


In [8]:
df.isnull().sum()

1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
Label    0
dtype: int64

In [13]:
import numpy as np

class Perceptron:
    def __init__(self, learning_rate=0.01, n_iterations=1000, random_state=42):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.random_state = random_state
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape
        
        # Initialize weights and bias
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        # Convert labels to +1 and -1 for Perceptron learning
        y_ = np.where(y <= 0, -1, 1)
        
        # Training process
        for i in range(self.n_iterations):
            # Shuffle data for better convergence
            indices = np.random.permutation(n_samples)
            X = X[indices]
            y_ = y_[indices]
            
            for idx, x_i in enumerate(X):
                linear_output = np.dot(x_i, self.weights) + self.bias
                y_predicted = self._activation_function(linear_output)
                
                # Update rule
                if y_[idx] * y_predicted <= 0:
                    self.weights += self.learning_rate * y_[idx] * x_i
                    self.bias += self.learning_rate * y_[idx]

            # Optional: Decrease learning rate over time
            self.learning_rate *= 0.99  # decay factor

    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        y_predicted = self._activation_function(linear_output)
        return np.where(y_predicted > 0, 1, 0)

    def _activation_function(self, x):
        return np.where(x >= 0, 1, -1)

# Step 1: Normalize the data (Feature scaling)
def normalize(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    return (X - mean) / std

# Load and parse the data (example data function)
def parse_data(data_file):
    X = []
    y = []
    with open(data_file, 'r') as file:
        for line in file:
            line = line.strip().split()
            label = int(line[0])
            features = [0] * 8  # Assuming 8 features in each row
            for item in line[1:]:
                index, value = item.split(':')
                features[int(index) - 1] = float(value)
            X.append(features)
            y.append(label)
    return np.array(X), np.array(y)

# Load data
data_file = 'diabetes_scale.txt'  # Replace with the path to your data file
X, y = parse_data(data_file)

# Step 3: Normalize the features
X = normalize(X)

# Step 4: Increase the number of iterations for training
perceptron = Perceptron(learning_rate=0.05, n_iterations=1500)  # Increase iterations
perceptron.fit(X, y)

# Step 5: Make predictions
y_pred = perceptron.predict(X)

# Step 6: Evaluate the model
accuracy = np.mean(y == y_pred)
print("Perceptron Model Accuracy:", accuracy)

# Optional: Visualize the weights and bias
print("Model weights:", perceptron.weights)
print("Model bias:", perceptron.bias)


Perceptron Model Accuracy: 0.5520833333333334
Model weights: [-2.12801643e-08 -1.18203230e-08  3.34205879e-08 -1.70400290e-08
  3.65050355e-08  1.56317152e-08 -4.94328727e-08  2.86537185e-08]
Model bias: 5.557666919822885e-08


In [12]:
learning_rates = [0.001, 0.01, 0.1, 0.5]
for lr in learning_rates:
    print(f"Training with learning rate: {lr}")
    perceptron = Perceptron(learning_rate=lr, n_iterations=2000)
    perceptron.fit(X, y)
    y_pred = perceptron.predict(X)
    accuracy = np.mean(y == y_pred)
    print(f"Accuracy: {accuracy}\n")


Training with learning rate: 0.001
Accuracy: 0.51953125

Training with learning rate: 0.01
Accuracy: 0.51953125

Training with learning rate: 0.1
Accuracy: 0.51953125

Training with learning rate: 0.5
Accuracy: 0.51953125



In [15]:
iterations = [1000, 1500 ,2000, 2500, 3000]
for it in iterations:
    print(f"Training with {it} iterations")
    perceptron = Perceptron(learning_rate=0.01, n_iterations=it)
    perceptron.fit(X, y)
    y_pred = perceptron.predict(X)
    accuracy = np.mean(y == y_pred)
    print(f"Accuracy: {accuracy}\n")

Training with 1000 iterations
Accuracy: 0.49609375

Training with 1500 iterations
Accuracy: 0.5520833333333334

Training with 2000 iterations
Accuracy: 0.51953125

Training with 2500 iterations
Accuracy: 0.46875

Training with 3000 iterations
Accuracy: 0.4166666666666667

