In [1]:
import numpy as np
import pandas as pd 

In [21]:


class GaussianNB:
    def __init__(self):
        self.classes = None
        self.mean = {}
        self.variance = {}
        self.priors = {}
    
    def fit(self, X, y):
        # Trouver toutes les classes uniques
        self.classes = np.unique(y)
        
        # Calculer les moyennes, variances et prior probabilities pour chaque classe
        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = np.mean(X_c, axis=0)
            self.variance[c] = np.var(X_c, axis=0)
            self.priors[c] = X_c.shape[0] / X.shape[0]
    
    def gaussian_density(self, class_idx, x):
        # Calculer la probabilité basée sur la distribution normale 
        mean = self.mean[class_idx]
        variance = self.variance[class_idx]
        numerator = np.exp(- (x - mean) ** 2 / (2 * variance))
        denominator = np.sqrt(2 * np.pi * variance)
        return numerator / denominator
    
    def predict(self, X):
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)
    
    def _predict_single(self, x):
        # Calculer la probabilité pour chaque classe
        posteriors = []
        
        for c in self.classes:
            prior = np.log(self.priors[c])  # Utiliser le log pour éviter les underflows
            conditional = np.sum(np.log(self.gaussian_density(c, x)))
            posterior = prior + conditional
            posteriors.append(posterior)
        
        # Retourner la classe avec la probabilité la plus élevée
        return self.classes[np.argmax(posteriors)]



In [22]:
salary_train=pd.read_csv("SalaryData_Train(1).csv")
salary_test=pd.read_csv("SalaryData_Test(1).csv")
salary_train.head(12)

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [23]:
from sklearn import preprocessing
number = preprocessing.LabelEncoder()
for column in salary_train.select_dtypes(include=['object']).columns:
    salary_train[column] = number.fit_transform(salary_train[column])
for column in salary_test.select_dtypes(include=['object']).columns:
    salary_test[column] = number.fit_transform(salary_test[column])
salary_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            30161 non-null  int64
 1   workclass      30161 non-null  int64
 2   education      30161 non-null  int64
 3   educationno    30161 non-null  int64
 4   maritalstatus  30161 non-null  int64
 5   occupation     30161 non-null  int64
 6   relationship   30161 non-null  int64
 7   race           30161 non-null  int64
 8   sex            30161 non-null  int64
 9   capitalgain    30161 non-null  int64
 10  capitalloss    30161 non-null  int64
 11  hoursperweek   30161 non-null  int64
 12  native         30161 non-null  int64
 13  Salary         30161 non-null  int64
dtypes: int64(14)
memory usage: 3.2 MB


In [24]:
x_train=salary_train.drop('Salary', axis=1)
x_test=salary_test.drop('Salary', axis=1)
y_train=salary_train['Salary']
y_test=salary_test['Salary']

In [25]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            30161 non-null  int64
 1   workclass      30161 non-null  int64
 2   education      30161 non-null  int64
 3   educationno    30161 non-null  int64
 4   maritalstatus  30161 non-null  int64
 5   occupation     30161 non-null  int64
 6   relationship   30161 non-null  int64
 7   race           30161 non-null  int64
 8   sex            30161 non-null  int64
 9   capitalgain    30161 non-null  int64
 10  capitalloss    30161 non-null  int64
 11  hoursperweek   30161 non-null  int64
 12  native         30161 non-null  int64
dtypes: int64(13)
memory usage: 3.0 MB


In [28]:
x_test.info()
x_train=np.array(x_train)
x_test=np.array(x_test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15060 entries, 0 to 15059
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            15060 non-null  int64
 1   workclass      15060 non-null  int64
 2   education      15060 non-null  int64
 3   educationno    15060 non-null  int64
 4   maritalstatus  15060 non-null  int64
 5   occupation     15060 non-null  int64
 6   relationship   15060 non-null  int64
 7   race           15060 non-null  int64
 8   sex            15060 non-null  int64
 9   capitalgain    15060 non-null  int64
 10  capitalloss    15060 non-null  int64
 11  hoursperweek   15060 non-null  int64
 12  native         15060 non-null  int64
dtypes: int64(13)
memory usage: 1.5 MB


In [29]:
sgnb=GaussianNB()

In [32]:
sgnb.fit(x_train,y_train)
prediction = sgnb.predict(x_test)
print(prediction)
print(y_test)

[[25  2  1 ...  0 40 37]
 [38  2 11 ...  0 50 37]
 [28  1  7 ...  0 40 37]
 ...
 [38  2  9 ...  0 50 37]
 [44  2  9 ...  0 40 37]
 [35  3  9 ...  0 60 37]]
0        0
1        0
2        1
3        1
4        0
        ..
15055    0
15056    0
15057    0
15058    0
15059    1
Name: Salary, Length: 15060, dtype: int64
