<a href="https://colab.research.google.com/github/IT22232236/fake-news-detection/blob/feat%2FmemberA-preprocess/notebooks/01_memberA_data_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# === BASIC SETUP (run once per session) ===
REPO = "fake-news-detection"
USER_OR_ORG = "IT22232236"
TOKEN = "token"  # paste at runtime ONLY
BRANCH = "feat/memberA-preprocess" # your branch name

!git config --global user.email "it22232236@my.sliit.lk"
!git config --global user.name "IT22232236"

!git clone https://{TOKEN}@github.com/{USER_OR_ORG}/{REPO}.git
%cd /content/{REPO}
!git checkout -b {BRANCH}


Cloning into 'fake-news-detection'...
/content/fake-news-detection
Switched to a new branch 'feat/memberA-preprocess'


In [4]:
from google.colab import files
uploaded = files.upload()      # choose both Fake.csv and True.csv together


Saving Fake.csv to Fake.csv
Saving True.csv to True.csv


In [5]:
import pandas as pd

fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

print("Fake:", fake.shape)
print("True:", true.shape)
print(fake.columns.tolist())   # should show ['title','text','subject','date']

Fake: (23481, 4)
True: (21417, 4)
['title', 'text', 'subject', 'date']


In [6]:
fake['label'] = 1   # 1 = fake
true['label'] = 0   # 0 = real

df = pd.concat([
    fake[['title','text','label']],
    true[['title','text','label']]
], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle

# Combine title + text into one column for NLP
df['text'] = (df['title'].fillna('') + ' ' + df['text'].fillna('')).str.strip()
df = df[['text','label']]
df.head()

Unnamed: 0,text,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,1
1,Trump drops Steve Bannon from National Securit...,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,0
3,OOPS: Trump Just Accidentally Confirmed He Lea...,1
4,Donald Trump heads for Scotland to reopen a go...,0


In [7]:
df.to_csv('fake-news-dataset.csv', index=False)
print(" Saved fake-news-dataset.csv")


 Saved fake-news-dataset.csv


In [11]:
!pip install emoji unidecode nltk


Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode, emoji
Successfully installed emoji-2.15.0 unidecode-1.4.0


In [12]:
# --- CLEANING ---
import re, emoji
from unidecode import unidecode
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

def clean_text(s):
    if pd.isna(s): return ""
    s = unidecode(str(s))
    s = emoji.replace_emoji(s, replace=' ')
    s = s.lower()
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return " ".join(w for w in s.split() if w not in STOPWORDS)

df['text_clean'] = df['text'].apply(clean_text)
df = df[df['text_clean'].str.len() > 0]
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,label,text_clean
0,Ben Stein Calls Out 9th Circuit Court: Committ...,1,ben stein calls 9th circuit court committed co...
1,Trump drops Steve Bannon from National Securit...,0,trump drops steve bannon national security cou...
2,Puerto Rico expects U.S. to lift Jones Act shi...,0,puerto rico expects u lift jones act shipping ...
3,OOPS: Trump Just Accidentally Confirmed He Lea...,1,oops trump accidentally confirmed leaked israe...
4,Donald Trump heads for Scotland to reopen a go...,0,donald trump heads scotland reopen golf resort...


In [13]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import numpy as np, os

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# Optional oversample
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(train_df[['text_clean']], train_df['label'])
train_df = pd.concat([X_res, y_res], axis=1)

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)

Train: (37554, 2) Val: (4489, 3) Test: (4489, 3)


In [14]:
os.makedirs('data/processed', exist_ok=True)

train_df.to_csv('data/processed/train.csv', index=False)
val_df.to_csv('data/processed/val.csv', index=False)
test_df.to_csv('data/processed/test.csv', index=False)

print("Saved train/val/test in data/processed/")

Saved train/val/test in data/processed/


In [17]:
!git add notebooks/01_memberA_data_preprocess.ipynb
!git commit -m "Member 1 – preprocessing complete"
!git push --set-upstream origin feat/memberA-preprocess

fatal: pathspec 'notebooks/01_memberA_data_preprocess.ipynb' did not match any files
On branch feat/memberA-preprocess

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mFake.csv[m
	[31mTrue.csv[m
	[31mdata/[m
	[31mfake-news-dataset.csv[m

nothing added to commit but untracked files present (use "git add" to track)
error: src refspec feat/memberA-preprocess does not match any
[31merror: failed to push some refs to 'https://github.com/IT22232236/fake-news-detection.git'
[m

In [18]:
# Move your notebook into the correct folder (if it's in /content)
!mkdir -p notebooks
!mv *.ipynb notebooks/01_memberA_data_preprocess.ipynb


mv: cannot stat '*.ipynb': No such file or directory


In [20]:
%%bash
mkdir -p notebooks data/raw data/processed results/figs report src

# create .gitignore
cat > .gitignore <<'EOF'
data/*
!data/README.md
results/*
!results/.gitkeep
*.csv
*.tsv
*.jsonl
*.joblib
*.pt
*.h5
*.keras
.kaggle/
.ipynb_checkpoints/
__pycache__/
EOF

# minimal placeholders
echo "Raw & processed data live here but are ignored by git." > data/README.md
touch results/.gitkeep
echo "## Fake News Detection" > README.md
echo "pandas
numpy
scikit-learn
imbalanced-learn
nltk
emoji
unidecode
matplotlib" > requirements.txt


In [21]:
!git add .gitignore data/README.md results/.gitkeep README.md requirements.txt
!git commit -m "Initial repo structure: notebooks/, data/, results/, .gitignore, README, requirements"
!git push --set-upstream origin feat/memberA-preprocess


[feat/memberA-preprocess (root-commit) ebf1776] Initial repo structure: notebooks/, data/, results/, .gitignore, README, requirements
 5 files changed, 24 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 data/README.md
 create mode 100644 requirements.txt
 create mode 100644 results/.gitkeep
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 2 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (9/9), 788 bytes | 788.00 KiB/s, done.
Total 9 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/IT22232236/fake-news-detection.git
 * [new branch]      feat/memberA-preprocess -> feat/memberA-preprocess
Branch 'feat/memberA-preprocess' set up to track remote branch 'feat/memberA-preprocess' from 'origin'.
