In [1]:
import boto3
import pandas as pd

# Specify the S3 bucket and file path
bucket_name = 'titanicsagemaker'
file_key = 'data/titanic.csv'

# Download the file from S3 to local file system
s3_client = boto3.client('s3')
s3_client.download_file(bucket_name, file_key, 'titanic.csv')

# Read the data into a Pandas DataFrame
data = pd.read_csv('titanic.csv')

# Display the first few rows of the data
print(data.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [2]:
print(data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [3]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
df=data.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns')

In [5]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,Fare
count,891.0,891.0,714.0,891.0
mean,0.383838,2.308642,29.699118,32.204208
std,0.486592,0.836071,14.526497,49.693429
min,0.0,1.0,0.42,0.0
25%,0.0,2.0,20.125,7.9104
50%,0.0,3.0,28.0,14.4542
75%,1.0,3.0,38.0,31.0
max,1.0,3.0,80.0,512.3292


In [6]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'Fare'], dtype='object')

In [7]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [8]:
print(df.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64


In [9]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [10]:
df['sex']=encoder.fit_transform(df['Sex'])

In [11]:
df.drop(['Sex'],axis='columns')

Unnamed: 0,Survived,Pclass,Age,Fare,sex
0,0,3,22.000000,7.2500,1
1,1,1,38.000000,71.2833,0
2,1,3,26.000000,7.9250,0
3,1,1,35.000000,53.1000,0
4,0,3,35.000000,8.0500,1
...,...,...,...,...,...
886,0,2,27.000000,13.0000,1
887,1,1,19.000000,30.0000,0
888,0,3,29.699118,23.4500,0
889,1,1,26.000000,30.0000,1


In [12]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,sex
0,0,3,male,22.000000,7.2500,1
1,1,1,female,38.000000,71.2833,0
2,1,3,female,26.000000,7.9250,0
3,1,1,female,35.000000,53.1000,0
4,0,3,male,35.000000,8.0500,1
...,...,...,...,...,...,...
886,0,2,male,27.000000,13.0000,1
887,1,1,female,19.000000,30.0000,0
888,0,3,female,29.699118,23.4500,0
889,1,1,male,26.000000,30.0000,1


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X=df.drop(['Survived','Sex'],axis='columns')
y=df.Survived

In [15]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [17]:
train_data = pd.concat([X_train, X_test], axis=0, ignore_index=True)


In [18]:
test_data = pd.concat([y_train, y_test], axis=0, ignore_index=True)


In [19]:
# Save train and test dataframes to CSV files locally
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)


In [20]:
s3 = boto3.client('s3')

# Define the S3 bucket name
bucket_name = 'titanicsagemaker'

# Define local file paths
train_file = 'train_data.csv'
test_file = 'test_data.csv'

In [21]:
train_s3_path = 'data/train_data.csv'
test_s3_path = 'data/test_data.csv'

In [22]:
s3.upload_file(train_file, bucket_name, train_s3_path)
print(f"Train data uploaded successfully to {train_s3_path}")
    
# Upload test data to 'data' folder
s3.upload_file(test_file, bucket_name, test_s3_path)
print(f"Test data uploaded successfully to {test_s3_path}")

Train data uploaded successfully to data/train_data.csv
Test data uploaded successfully to data/test_data.csv


In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [24]:
X_train


Unnamed: 0,Pclass,Age,Fare,sex
705,2,39.000000,26.00,1
47,3,29.699118,7.75,0
616,3,34.000000,14.40,1
560,3,29.699118,7.75,1
419,3,10.000000,24.15,0
...,...,...,...,...
674,2,29.699118,0.00,1
283,3,19.000000,8.05,1
461,3,34.000000,8.05,1
564,3,29.699118,8.05,0


In [25]:
model.fit(X_train,y_train)

In [26]:
model.predict(X_test)

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0])

In [27]:
accuracy=model.score(X_test, y_test)

In [28]:
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 0.80


In [29]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
folds=StratifiedKFold(n_splits=3)

In [30]:
cross_val_score(LogisticRegression(),X,y)

array([0.7877095 , 0.78089888, 0.78651685, 0.7752809 , 0.80337079])

In [31]:
import joblib
joblib.dump(model, 'model.joblib')

['model.joblib']

In [32]:
model = joblib.load('model.joblib')

In [33]:
import tarfile

# Create a tar.gz file containing the .joblib model
with tarfile.open('model.tar.gz', 'w:gz') as tar:
    tar.add('model.joblib', arcname='model.joblib')

# Optionally, remove the intermediate joblib file
import os
os.remove('model.joblib')


In [34]:
import boto3

# Initialize boto3 S3 client
s3_client = boto3.client('s3')

# Specify the S3 bucket and the folder path in S3
bucket_name = 'titanicsagemaker'  # Replace with your bucket name
s3_file_path = 'model/model.tar.gz'  # The path inside the 'model' folder

# Upload the model .tar.gz file to S3
with open('model.tar.gz', 'rb') as data:
    s3_client.upload_fileobj(data, bucket_name, s3_file_path)



print(f"Model uploaded to s3://{bucket_name}/{s3_file_path}")


Model uploaded to s3://titanicsagemaker/model/model.tar.gz


In [35]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.model import SKLearnModel

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [36]:
# Get the SageMaker execution role
role = get_execution_role()

In [37]:
# Define the S3 URI for your model
model_s3_path = 's3://titanicsagemaker/model/model.tar.gz'