In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the Youth Tobacco Survey CSV export into a DataFrame
df = pd.read_csv(filepath_or_buffer='youth_tobacco_survey_yts_data.csv')

In [3]:
# Preview the top five rows as a quick check that the CSV was parsed as expected
df.head(n=5)

Unnamed: 0,year,locationabbr,locationdesc,topictype,topicdesc,measuredesc,datasource,response,data_value_unit,data_value_type,...,geolocation,topictypeid,topicid,measureid,stratificationid1,stratificationid2,stratificationid3,stratificationid4,submeasureid,displayorder
0,2015,AZ,Arizona,Tobacco Use – Survey Data,Cessation (Youth),Percent of Current Smokers Who Want to Quit,YTS,,%,Percentage,...,POINT(-111.76381127699972 34.865970280000454),BEH,105BEH,170CES,1GEN,8AGE,6RAC,1EDU,YTS01,1
1,2015,AZ,Arizona,Tobacco Use – Survey Data,Cessation (Youth),Percent of Current Smokers Who Want to Quit,YTS,,%,Percentage,...,POINT(-111.76381127699972 34.865970280000454),BEH,105BEH,170CES,2GEN,8AGE,6RAC,1EDU,YTS02,2
2,2015,AZ,Arizona,Tobacco Use – Survey Data,Cessation (Youth),Percent of Current Smokers Who Want to Quit,YTS,,%,Percentage,...,POINT(-111.76381127699972 34.865970280000454),BEH,105BEH,170CES,3GEN,8AGE,6RAC,1EDU,YTS03,3
3,2015,AZ,Arizona,Tobacco Use – Survey Data,Cessation (Youth),Quit Attempt in Past Year Among Current Cigare...,YTS,,%,Percentage,...,POINT(-111.76381127699972 34.865970280000454),BEH,105BEH,169QUA,1GEN,8AGE,6RAC,1EDU,YTS04,4
4,2015,AZ,Arizona,Tobacco Use – Survey Data,Cessation (Youth),Quit Attempt in Past Year Among Current Cigare...,YTS,,%,Percentage,...,POINT(-111.76381127699972 34.865970280000454),BEH,105BEH,169QUA,2GEN,8AGE,6RAC,1EDU,YTS05,5


In [4]:
# Count the missing values in each column
df.isna().sum()

year                             0
locationabbr                     0
locationdesc                     0
topictype                        0
topicdesc                        0
measuredesc                      0
datasource                       0
response                      2246
data_value_unit                  0
data_value_type                  0
data_value                     425
data_value_footnote_symbol    9369
data_value_footnote           9369
data_value_std_err             425
low_confidence_limit           425
high_confidence_limit          425
sample_size                    425
gender                           0
race                             0
age                              0
education                        0
geolocation                      4
topictypeid                      0
topicid                          0
measureid                        0
stratificationid1                0
stratificationid2                0
stratificationid3                0
stratificationid4   

In [5]:
# Drop the two footnote columns: they are almost entirely missing and are not used
# as features. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run -- a second in-place drop of the
# same columns would raise KeyError because they are already gone.
df = df.drop(
    columns=['data_value_footnote_symbol', 'data_value_footnote'],
    errors='ignore',
)

In [6]:
# Keep only the rows that have a 'response' value: it is the target variable, and
# rows without it cannot serve as labelled examples for the model trained further
# down in this notebook.
df.dropna(axis=0, how='any', subset=['response'], inplace=True)

In [7]:
# Re-count the missing values per column after the drops above
df.isna().sum()

year                     0
locationabbr             0
locationdesc             0
topictype                0
topicdesc                0
measuredesc              0
datasource               0
response                 0
data_value_unit          0
data_value_type          0
data_value               0
data_value_std_err       0
low_confidence_limit     0
high_confidence_limit    0
sample_size              0
gender                   0
race                     0
age                      0
education                0
geolocation              0
topictypeid              0
topicid                  0
measureid                0
stratificationid1        0
stratificationid2        0
stratificationid3        0
stratificationid4        0
submeasureid             0
displayorder             0
dtype: int64

In [8]:
# Show each column's dtype to see which columns are still non-numeric (object)
df.dtypes

year                       int64
locationabbr              object
locationdesc              object
topictype                 object
topicdesc                 object
measuredesc               object
datasource                object
response                  object
data_value_unit           object
data_value_type           object
data_value               float64
data_value_std_err       float64
low_confidence_limit     float64
high_confidence_limit    float64
sample_size              float64
gender                    object
race                      object
age                       object
education                 object
geolocation               object
topictypeid               object
topicid                   object
measureid                 object
stratificationid1         object
stratificationid2         object
stratificationid3         object
stratificationid4         object
submeasureid              object
displayorder               int64
dtype: object

In [9]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# NOTE(review): StandardScaler is imported here but is not used in the cells shown.

# Replace the categorical 'gender' and 'locationabbr' columns with integer codes.
# Each encoder is fitted first and kept in its own variable so the learned
# label <-> code mapping stays available; 'response' is converted in a later cell.
le_gender = LabelEncoder().fit(df['gender'])
le_location = LabelEncoder().fit(df['locationabbr'])

df['gender'] = le_gender.transform(df['gender'])
df['locationabbr'] = le_location.transform(df['locationabbr'])

In [10]:
# List the distinct labels present in the target column
df['response'].unique()

array(['Current', 'Ever', 'Frequent'], dtype=object)

In [11]:
# Integer code assigned to each response label
response_codes = {'Current': 0, 'Ever': 1, 'Frequent': 2}

# Series.map turns every value that is not a key of the dict into NaN. If this cell
# were run a second time the column would already hold the integer codes, none of
# which is a key, so every label would become NaN (and the dropna in the next cell
# would then discard every row). Only map while the column still holds the labels.
if not pd.api.types.is_numeric_dtype(df['response']):
    df['response'] = df['response'].map(response_codes)

In [12]:
# Remove every row that still contains a missing value in any column
df.dropna(axis=0, how='any', inplace=True)

In [13]:
# Feature matrix: encoded gender, encoded location and the reported data value
X = df.loc[:, ['gender', 'locationabbr', 'data_value']]
# Target vector: the response column
y = df.loc[:, 'response']

In [14]:
# Display the feature matrix
X

Unnamed: 0,gender,locationabbr,data_value
6,2,2,3.2
7,1,2,3.2
8,0,2,3.1
9,2,2,12.5
10,1,2,12.9
...,...,...,...
9789,2,40,1.1
9790,1,40,7.7
9791,1,40,1.9
9792,0,40,0.2


In [15]:
# Display the target vector
y

6       0
7       0
8       0
9       1
10      1
       ..
9789    2
9790    2
9791    2
9792    2
9793    2
Name: response, Length: 7548, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [17]:
from sklearn.preprocessing import PolynomialFeatures

# Build the degree-5 polynomial expansion. The transformer is fitted on the
# training features only and then applied, unchanged, to both splits.
poly = PolynomialFeatures(degree=5).fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the linear regression model (it is fitted in the next cell)
# NOTE(review): the target is a set of integer class codes and the predictions are
# rounded to labels further down, so a classifier may suit this task better --
# confirm whether using a regressor here is intentional.
model = LinearRegression()

In [19]:
# Fit the model on the polynomial-expanded training features
model.fit(X_train_poly, y_train)

In [20]:
# Predict on the polynomial-expanded test features; the values are continuous
# regression outputs, not class labels yet
y_pred = model.predict(X_test_poly)

In [21]:
# Display the raw (unrounded) predictions
y_pred

array([1.23768743, 0.64534292, 1.65273397, ..., 0.74479987, 0.61005264,
       1.12100992])

In [22]:
# The model outputs continuous values, so snap each prediction to the nearest
# integer class code. Rounding alone is not enough: a regression output is
# unbounded and can round to a value that is not a class at all (e.g. -1 or 3),
# which would show up as a spurious extra class in the metrics. Clip to 0..2, the
# codes assigned to the response labels, and cast to int so the predicted labels
# are integers rather than floats.
y_pred_rounded = np.clip(np.round(y_pred), 0, 2).astype(int)


In [23]:
# Display the predictions after rounding
y_pred_rounded

array([1., 1., 2., ..., 1., 1., 1.])

In [24]:
# Share of test rows whose rounded prediction equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rounded)

# Report the score
print(f'Accuracy: {accuracy}')

Accuracy: 0.43667196608373077


In [25]:
# Build the per-class precision / recall / F1 summary for the test set, then print it
report = classification_report(y_true=y_test, y_pred=y_pred_rounded)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.01      0.02       613
           1       0.37      0.99      0.54       633
           2       0.94      0.30      0.46       641

    accuracy                           0.44      1887
   macro avg       0.57      0.43      0.34      1887
weighted avg       0.57      0.44      0.34      1887



In [26]:
# Cross-tabulate the true test labels against the rounded predictions
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_rounded)
print(f'Confusion Matrix:\n{conf_matrix}')

Confusion Matrix:
[[  5 597  11]
 [  6 625   2]
 [  2 445 194]]
