In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [32]:
#load csv data
hiv_data = pd.read_csv('./HIV_dataset.csv')

In [33]:
#number of rows and columns
hiv_data.shape

(698, 10)

In [34]:
# csv data information
hiv_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   Age                             698 non-null    int64
 1   Marital Staus                   698 non-null    int64
 2   STD                             698 non-null    int64
 3   Educational Background          698 non-null    int64
 4   HIV TEST IN PAST YEAR           698 non-null    int64
 5   AIDS education                  698 non-null    int64
 6   Places of seeking sex partners  698 non-null    int64
 7   SEXUAL ORIENTATION              698 non-null    int64
 8   Drug- taking                    698 non-null    int64
 9   Result                          698 non-null    int64
dtypes: int64(10)
memory usage: 54.7 KB


In [35]:
#check for null values
hiv_data.isnull().sum()

Age                               0
Marital Staus                     0
STD                               0
Educational Background            0
HIV TEST IN PAST YEAR             0
AIDS education                    0
Places of seeking sex partners    0
SEXUAL ORIENTATION                0
Drug- taking                      0
Result                            0
dtype: int64

In [36]:
#statistical measure of the data
hiv_data.describe()

Unnamed: 0,Age,Marital Staus,STD,Educational Background,HIV TEST IN PAST YEAR,AIDS education,Places of seeking sex partners,SEXUAL ORIENTATION,Drug- taking,Result
count,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0,698.0
mean,40.022923,2.246418,0.468481,2.507163,0.386819,0.408309,3.885387,1.30086,0.425501,0.495702
std,18.154906,1.144895,0.499363,1.389892,0.487371,0.491873,1.808172,1.083871,0.494773,0.50034
min,12.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,25.0,1.0,0.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0
50%,38.0,2.0,0.0,2.0,0.0,0.0,4.0,1.0,0.0,0.0
75%,54.0,3.0,1.0,4.0,1.0,1.0,5.0,3.0,1.0,1.0
max,80.0,5.0,1.0,5.0,1.0,1.0,6.0,4.0,1.0,1.0


In [37]:
#distribution of AIDS education
hiv_data['Result'].value_counts()

Result
0    352
1    346
Name: count, dtype: int64

In [38]:
#splitting distribution of education of AIDS
x = hiv_data.drop(columns ='Result', axis=1)
y = hiv_data['Result']

In [39]:
print(x)

     Age  Marital Staus  STD  Educational Background  HIV TEST IN PAST YEAR  \
0     22              2    0                       1                      1   
1     20              2    0                       1                      0   
2     23              1    1                       1                      0   
3     24              1    0                       1                      1   
4     18              2    1                       2                      1   
..   ...            ...  ...                     ...                    ...   
693   19              2    1                       4                      0   
694   47              1    0                       1                      0   
695   34              2    1                       2                      1   
696   25              1    0                       3                      0   
697   46              5    1                       2                      1   

     AIDS education  Places of seeking sex partners

In [40]:
print (y)

0      1
1      0
2      0
3      1
4      1
      ..
693    1
694    0
695    0
696    1
697    1
Name: Result, Length: 698, dtype: int64


In [41]:
#splitting data into training and test data
x_train, x_test, y_train, y_test =train_test_split(x,y, test_size=0.2, stratify=y, random_state =2)

In [42]:
print(x.shape ,x_train.shape, x_test.shape)

(698, 9) (558, 9) (140, 9)


In [43]:
#using logistic regression model to train data
model = LogisticRegression()

In [44]:
#train the model using training data
#finding relationship between feaures and AIDS education
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
#Evaluate the model perfomance by finding the accurancy score
#accurancy on the train data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [46]:
print('Training data accuaracy: ',training_data_accuracy)

Training data accuaracy:  0.7652329749103942


In [47]:
#accurancy on the test data
x_test_prediction = model.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [48]:
print('Testing data accuaracy: ',testing_data_accuracy)

Testing data accuaracy:  0.8357142857142857


In [49]:
#Develop a system to predict whwther one is HIVpositive or not
input_data = (22,2,0,1,1,0,1,1,1) 
#convert the input data into a numpy array
input_data_numpy_array = np.asarray(input_data)

#reshape the numpy array
input_data_reshape = input_data_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshape)
print(prediction)

[0]




In [50]:
if(prediction[0] == 0):
    print("The person is HIV negative")
else:
    print('The person is HIV positive ')

The person is HIV negative
