In [2]:
!pip install pandas numpy scikit-learn

Collecting pandas
  Downloading pandas-2.3.2-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.3.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.2-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0

In [3]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.8.29-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------------------------

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Dataset creation

In [8]:
# One (company, claim, label) tuple per example. Keeping the three values of
# a row together means a claim cannot drift out of alignment with its label.
claim_rows = [
    ("EcoPlast",
     "Our products are 100% biodegradable and safe for the planet.",
     "greenwashing"),
    ("GreenEnergyCo",
     "We are fully carbon neutral and invest in renewable energy.",
     "genuine"),
    ("FastFashionX",
     "We launched an eco-friendly collection using sustainable cotton.",
     "greenwashing"),
    ("PureDrinks",
     "Bottled water is packaged in recycled plastic bottles.",
     "greenwashing"),
    ("AutoFuture",
     "Our electric cars produce zero emissions across their lifecycle.",
     "genuine"),
    ("FreshFoods",
     "We source all vegetables locally to reduce carbon footprint.",
     "genuine"),
    ("PowerCorp",
     "We cut down emissions by 50% in our coal-powered plants.",
     "greenwashing"),
    ("Clothify",
     "We donate 2% of sales to plant trees worldwide.",
     "genuine"),
]

# Column-oriented dict (column name -> list of values), in the column order
# Company, Claim, Label.
data = {
    "Company": [company for company, _, _ in claim_rows],
    "Claim": [claim for _, claim, _ in claim_rows],
    "Label": [label for _, _, label in claim_rows],
}


In [9]:
# Build the working DataFrame from the column dict, then preview the first
# five rows (the default for head()).
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,Company,Claim,Label
0,EcoPlast,Our products are 100% biodegradable and safe f...,greenwashing
1,GreenEnergyCo,We are fully carbon neutral and invest in rene...,genuine
2,FastFashionX,We launched an eco-friendly collection using s...,greenwashing
3,PureDrinks,Bottled water is packaged in recycled plastic ...,greenwashing
4,AutoFuture,Our electric cars produce zero emissions acros...,genuine


In [10]:
# Report the dataset dimensions as (rows, columns).
print(f"Shape: {df.shape}")

Shape: (8, 3)


In [11]:
# Number of missing entries in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)


Missing values:
 Company    0
Claim      0
Label      0
dtype: int64


In [12]:
# How many examples carry each label, to check the class balance.
label_counts = df['Label'].value_counts()
print("\nLabel counts:\n", label_counts)


Label counts:
 Label
greenwashing    4
genuine         4
Name: count, dtype: int64


In [13]:
# Pre-processing, step 1: data cleaning

In [15]:
import re

def clean_text(text):
    """Normalise a claim string before vectorization.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not ``a``-``z`` or whitespace
         (digits, punctuation, hyphens and non-ASCII letters are removed);
      3. collapse each run of whitespace to a single space and trim both ends.

    Note: because hyphens are deleted rather than replaced by a space,
    hyphenated words are fused ("eco-friendly" -> "ecofriendly"), and
    figures such as "100%" disappear entirely.

    Parameters
    ----------
    text : str or missing value
        Raw claim text. Any non-string value (e.g. ``None`` or the float
        ``NaN`` pandas uses for missing cells) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is not a string.
    """
    # Missing cells reach Series.apply() as None/NaN; calling .lower() on
    # them would raise AttributeError, so treat them as empty text instead.
    if not isinstance(text, str):
        return ""
    text = text.lower()                      # lowercase
    text = re.sub(r'[^a-z\s]', '', text)     # keep only letters and whitespace
    text = re.sub(r'\s+', ' ', text).strip() # collapse and trim whitespace
    return text

# Store the cleaned version of every claim in a new column, then show the
# raw and cleaned text side by side to check the effect of the cleaning.
df['Cleaned_Claim'] = df['Claim'].map(clean_text)
df.loc[:, ['Claim', 'Cleaned_Claim']]

Unnamed: 0,Claim,Cleaned_Claim
0,Our products are 100% biodegradable and safe f...,our products are biodegradable and safe for th...
1,We are fully carbon neutral and invest in rene...,we are fully carbon neutral and invest in rene...
2,We launched an eco-friendly collection using s...,we launched an ecofriendly collection using su...
3,Bottled water is packaged in recycled plastic ...,bottled water is packaged in recycled plastic ...
4,Our electric cars produce zero emissions acros...,our electric cars produce zero emissions acros...
5,We source all vegetables locally to reduce car...,we source all vegetables locally to reduce car...
6,We cut down emissions by 50% in our coal-power...,we cut down emissions by in our coalpowered pl...
7,We donate 2% of sales to plant trees worldwide.,we donate of sales to plant trees worldwide


In [16]:
from sklearn.model_selection import train_test_split

# Feature: the cleaned claim text. Target: the label column.
X, y = df['Cleaned_Claim'], df['Label']

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training size: {len(X_train)}")
print(f"Testing size: {len(X_test)}")

Training size: 6
Testing size: 2


In [17]:
# Pre-processing, step 2: text vectorization

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned claims: English stop words are dropped and the
# vocabulary is capped at 1000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit on the training claims only; the test claims are transformed with the
# already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

for split_name, tfidf_matrix in (("train", X_train_tfidf), ("test", X_test_tfidf)):
    print(f"TF-IDF shape ({split_name}):", tfidf_matrix.shape)

TF-IDF shape (train): (6, 30)
TF-IDF shape (test): (2, 30)
