In [None]:
!pip install boto3 pandas scikit-learn python-dotenv s3fs

In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Read the labelled articles and discard rows lacking text or a label.
# Labels are used as-is: they are already encoded as 0 (FAKE) and 1 (REAL).
df = pd.read_csv("data/fake_true_news_dataset.csv").dropna(subset=['text', 'label'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training texts only, then apply it to the test texts.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the vectorized training set.
model = MultinomialNB().fit(X_train_vec, y_train)

# Score the held-out rows and print per-class metrics.
y_pred = model.predict(X_test_vec)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Persist both fitted objects; inference needs the vectorizer as well as the model.
for artifact_path, artifact in (
    ("news_model.pkl", model),
    ("tfidf_vectorizer.pkl", vectorizer),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("✅ Model and vectorizer saved.")



Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      4678
           1       0.94      0.93      0.93      4302

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980

✅ Model and vectorizer saved.


In [None]:
import tarfile

# Bundle the pickled model and vectorizer into a gzip-compressed tar archive.
with tarfile.open("model.tar.gz", "w:gz") as tar:
    for artifact_name in ("news_model.pkl", "tfidf_vectorizer.pkl"):
        tar.add(artifact_name)

print("✅ model.tar.gz created with both files.")


In [None]:
import tarfile

# Write the archive again (mode "w:gz" starts a new file) so that it also
# carries the inference script next to the two pickles.
archive_members = ["news_model.pkl", "tfidf_vectorizer.pkl", "inference.py"]
with tarfile.open("model.tar.gz", "w:gz") as tar:
    for member in archive_members:
        tar.add(member)

print("✅ Updated model.tar.gz with inference.py")


In [None]:
import boto3
# Low-level S3 client, created with boto3's default credential/region resolution.
s3 = boto3.client(service_name="s3")


In [None]:
# Destination bucket for the packaged model archive.
bucket = "news-truth-checker-data-2025"

# Push the local archive to the bucket, using the file name as the object key.
s3.upload_file(Filename="model.tar.gz", Bucket=bucket, Key="model.tar.gz")
print("✅ Re-uploaded updated model.tar.gz to S3")


In [None]:
from sagemaker import get_execution_role

# Ask the SageMaker SDK for the execution role and print it for reference.
role = get_execution_role()
print(role)


In [None]:
from dotenv import load_dotenv
import os
import boto3
import sagemaker

# Load variables from a local .env file into the process environment.
load_dotenv()

# Credentials and region come from the environment. Any credential that is not
# set is returned as None, which boto3 treats the same as "not supplied".
aws_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY")
# Temporary credentials (STS, SSO, assumed roles) are only valid together with
# their session token; passing key + secret alone would make AWS reject them.
aws_token = os.getenv("AWS_SESSION_TOKEN")
region = os.getenv("AWS_REGION", "us-east-1")

# Build boto3 + sagemaker session with your credentials
boto_session = boto3.Session(
    aws_access_key_id=aws_key,
    aws_secret_access_key=aws_secret,
    aws_session_token=aws_token,
    region_name=region,
)

# SageMaker SDK session that issues its AWS calls through the boto3 session above.
sagemaker_session = sagemaker.Session(boto_session=boto_session)


In [None]:
import boto3
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.predictor import Predictor

# Connect to the existing endpoint.
# The explicitly configured `sagemaker_session` is passed in so that requests use
# the credentials and region set up for this notebook; without it the SDK would
# silently create its own default session and ignore that configuration.
predictor = Predictor(
    endpoint_name="news-truth-checker-endpoint-v1",
    sagemaker_session=sagemaker_session,
    # Requests are sent as JSON and responses are parsed from JSON.
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

print("✅ Connected to endpoint.")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle
import pandas as pd

# Reload the dataset, keeping only rows that have both text and a label.
df = pd.read_csv("data/fake_true_news_dataset.csv").dropna(subset=['text', 'label'])

# No hold-out split in this cell: every remaining row is used for fitting.
X = df['text']
y = df['label']

# Build the TF-IDF features from all texts.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_vec = vectorizer.fit_transform(X)

# Train the Naive Bayes classifier on the full feature matrix.
model = MultinomialNB().fit(X_vec, y)

# Overwrite the pickles on disk with the freshly fitted objects.
for artifact_path, artifact in (
    ("news_model.pkl", model),
    ("tfidf_vectorizer.pkl", vectorizer),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("✅ Re-saved fitted model and vectorizer.")


In [5]:
import json
from inference import model_fn, input_fn, predict_fn, output_fn

# Local dry run of the four handler functions imported from inference.py,
# chained in the order model_fn -> input_fn -> predict_fn -> output_fn.

# Load the model objects from the current directory.
model_objects = model_fn(".")

# Build a JSON request body carrying a single "text" field.
payload = {"text": "NASA confirms the discovery of water on the Moon."}
request_body = json.dumps(payload)
content_type = "application/json"

# Parse the request, run the prediction, and format the response.
parsed_input = input_fn(request_body, content_type)
prediction = predict_fn(parsed_input, model_objects)
result = output_fn(prediction, content_type)

print("✅ Local prediction result:", result)


ImportError: cannot import name 'model_fn' from 'inference' (/home/sagemaker-user/src/inference.py)