In [12]:
# Loading Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Show every column when a DataFrame is rendered (None lifts the column limit)
pd.set_option("display.max_columns", None)

In [7]:
# Read the cleaned dataset (path is relative to the working directory) into a DataFrame
adult = pd.read_csv(filepath_or_buffer="Clean_Data.csv")

# Display the column labels of the loaded frame
adult.columns

Index(['SRVY_YR', 'URBRRL', 'REGION', 'AGEP_A', 'SEX_A', 'HISP_A',
       'RACEALLP_A', 'MLTFAMFLG_A', 'PHSTAT_A', 'HYPEV_A', 'CHLEV_A',
       'ANGEV_A', 'MIEV_A', 'STREV_A', 'ASEV_A', 'NUMCAN_A', 'HEIGHTTC_A',
       'WEIGHTLBTC_A', 'BMICAT_A', 'DISAB3_A', 'NOTCOV_A', 'PAYWORRY_A',
       'URGNT12MTC_A', 'EMERG12MTC_A', 'ANXLEVEL_A', 'DEPLEVEL_A',
       'SMKCIGST_A', 'SMKECIGST_A', 'LEGMSTAT_A', 'PARSTAT_A', 'CITZNSTP_A',
       'SCHCURENR_A', 'POVRATTC_A', 'FSNAP12M_A', 'FDSCAT4_A', 'HOUTENURE_A',
       'CHDEV_A', 'DIA_STATUS'],
      dtype='object')

In [8]:
# Build the working frame by removing the listed columns; `adult` itself is left unchanged
adult_new = adult.drop(
    labels=[
        'CHDEV_A',
        'HOUTENURE_A',
        'SCHCURENR_A',
        'CITZNSTP_A',
        'PARSTAT_A',
        'LEGMSTAT_A',
        'MLTFAMFLG_A',
        'FSNAP12M_A',
        'HEIGHTTC_A',
        'WEIGHTLBTC_A',
    ],
    axis="columns",
)

In [10]:
# Preview the first five rows of the working frame
adult_new.head(n=5)

Unnamed: 0,SRVY_YR,URBRRL,REGION,AGEP_A,SEX_A,HISP_A,RACEALLP_A,PHSTAT_A,HYPEV_A,CHLEV_A,ANGEV_A,MIEV_A,STREV_A,ASEV_A,NUMCAN_A,HEIGHTTC_A,WEIGHTLBTC_A,BMICAT_A,DISAB3_A,NOTCOV_A,PAYWORRY_A,URGNT12MTC_A,EMERG12MTC_A,ANXLEVEL_A,DEPLEVEL_A,SMKCIGST_A,SMKECIGST_A,POVRATTC_A,FSNAP12M_A,FDSCAT4_A,DIA_STATUS
0,2022,2,3,85,1,2,1,2,1,2,2,2,2,2,0,68,148,2,2,2,3,0,0,2.0,0.0,3,3,1.92,2,1,0.0
1,2022,4,3,64,1,2,1,3,1,1,1,2,2,2,1,74,235,4,2,2,3,0,0,1.0,0.0,3,3,10.3,2,1,0.0
2,2022,4,3,37,2,2,1,3,2,2,2,2,2,1,0,69,218,4,2,2,3,1,1,3.0,1.0,4,3,9.36,2,1,3.0
3,2022,4,3,72,2,2,1,2,1,2,2,2,2,1,0,64,240,4,2,2,3,0,0,0.0,0.0,4,3,3.66,2,1,0.0
4,2022,1,3,84,2,2,2,2,1,2,2,2,2,2,0,66,183,3,2,2,2,0,0,1.0,0.0,4,3,0.69,2,1,0.0


In [8]:
# Count the missing entries in each column of the working frame
adult_new.isna().sum()

SRVY_YR           int64
URBRRL            int64
REGION            int64
AGEP_A            int64
SEX_A             int64
HISP_A            int64
RACEALLP_A        int64
PHSTAT_A          int64
HYPEV_A           int64
CHLEV_A           int64
ANGEV_A           int64
MIEV_A            int64
STREV_A           int64
ASEV_A            int64
NUMCAN_A          int64
HEIGHTTC_A        int64
WEIGHTLBTC_A      int64
BMICAT_A          int64
DISAB3_A          int64
NOTCOV_A          int64
PAYWORRY_A        int64
URGNT12MTC_A      int64
EMERG12MTC_A      int64
ANXLEVEL_A      float64
DEPLEVEL_A      float64
SMKCIGST_A        int64
SMKECIGST_A       int64
POVRATTC_A      float64
FSNAP12M_A        int64
FDSCAT4_A         int64
DIA_STATUS      float64
dtype: object

In [None]:
# Inspect the dtype pandas assigned to each column of the working frame
adult_new.dtypes

In [14]:
# Checking for unique variables for each column
for column in adult_new.columns:
    unique_values = adult_new[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Unique values in column 'SRVY_YR': [2022 2021 2020 2019]
Unique values in column 'URBRRL': [2 4 1 3]
Unique values in column 'REGION': [3 4 1 2]
Unique values in column 'AGEP_A': [85 64 37 72 84 31 81 68 57 78 59 27 58 38 83 46 42 19 74 32 25 45 47 62
 61 35 44 24 55 69 63 43 80 60 23 65 36 53 50 76 20 26 52 39 41 73 40 67
 34 71 29 18 21 33 30 77 49 82 51 48 22 79 75 56 70 28 66 54 97 99]
Unique values in column 'SEX_A': [1 2]
Unique values in column 'HISP_A': [2 1]
Unique values in column 'RACEALLP_A': [1 2 6 3 4 5]
Unique values in column 'PHSTAT_A': [2 3 1 5 4]
Unique values in column 'HYPEV_A': [1 2]
Unique values in column 'CHLEV_A': [2 1]
Unique values in column 'ANGEV_A': [2 1]
Unique values in column 'MIEV_A': [2 1]
Unique values in column 'STREV_A': [2 1]
Unique values in column 'ASEV_A': [2 1]
Unique values in column 'NUMCAN_A': [0 1 2 3 4]
Unique values in column 'HEIGHTTC_A': [68 74 69 64 66 71 63 67 62 65 73 72 70 96 61 59 76 75 60]
Unique values in column 'WEIGHTLBTC_A':

In [None]:
# Split data into a feature matrix (X) and a target vector (y), then hold out
# 30% of the rows as a test set.
# NOTE(review): the original cell left the column name as an empty string (''),
# which raises KeyError, and had X/y inverted (X was the single column, y the rest).
# DIA_STATUS is assumed to be the outcome to predict -- confirm before modelling.
TARGET_COLUMN = 'DIA_STATUS'

# Features: every column except the target (columns= so a column, not a row label, is dropped)
X = adult_new.drop(columns=[TARGET_COLUMN])
# Target: the single outcome column
y = adult_new[TARGET_COLUMN]

# random_state pins the shuffle so the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [11]:
# Distinct values present in the SRVY_YR column
adult_new.loc[:, 'SRVY_YR'].unique()

array([2022, 2021, 2020, 2019], dtype=int64)