In [None]:
import gc

import albumentations as A
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [None]:
# Report the installed PyTorch version so a run can be reproduced later.
print(f"Torch: {torch.__version__}")

In [None]:
# Hyper-parameters and runtime settings read by the cells below.
config = dict(
    batch_size=256,   # samples per mini-batch for every DataLoader
    num_workers=4,    # DataLoader worker processes
    lr=0.0001,        # learning rate handed to the optimizer
    epochs=10,        # number of passes of the training loop
    device='cpu',     # torch device the model and batches are moved to
    image_size=128,   # height and width used by the resize / crop transforms
)

In [None]:
# Image pipeline handed to RealFakeDataset: a fixed resize followed by random
# geometric / photometric augmentations.
transforms = A.Compose(
    [
        # Bring every image to image_size x image_size.
        A.Resize(p=1, height=config["image_size"], width=config["image_size"]),
        # Crop window has the same size as the resized image.
        A.RandomCrop(p=1, height=config["image_size"], width=config["image_size"]),
        A.HorizontalFlip(p=0.8),
        A.ShiftScaleRotate(p=0.5),
        A.RandomBrightnessContrast(p=0.8),
    ],
    p=1,
)



In [None]:
class RealFakeDataset(Dataset):
    """Dataset that reads images from disk and returns channel-first float tensors.

    Args:
        data_path: Indexable sequence of image file paths.
        target: Indexable sequence of labels aligned with ``data_path``.
            Required unless ``is_test`` is True.
        is_test: When True, ``__getitem__`` returns only the image tensor.
        augmentation: Optional callable invoked as ``augmentation(image=image)``
            that returns a mapping holding the transformed array under
            ``"image"``.

    Raises:
        ValueError: If ``is_test`` is False and no ``target`` is supplied.
    """

    def __init__(self, data_path, target=None, is_test=False, augmentation=None):
        super().__init__()
        # Fail at construction time instead of on the first __getitem__ call.
        if not is_test and target is None:
            raise ValueError("target is required when is_test is False")
        self.data_path = data_path
        self.target = target
        self.is_test = is_test
        self.augmentation = augmentation

    def __len__(self):
        """Return the number of image paths."""
        return len(self.data_path)

    def __getitem__(self, item):
        """Load one sample.

        Returns:
            The image as a float tensor with the last (channel) axis moved to
            the front; when ``is_test`` is False, a ``(image, label)`` tuple
            where the label is a float tensor.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        path = self.data_path[item]
        image = cv2.imread(path)
        # cv2.imread does not raise on a missing/corrupt file, it returns None;
        # surface that here with the offending path instead of a later,
        # unrelated-looking failure inside the augmentation or numpy.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")

        if self.augmentation:
            image = self.augmentation(image=image)["image"]

        # H x W x C array -> C x H x W tensor.
        image_tensor = torch.tensor(np.moveaxis(image, -1, 0), dtype=torch.float)

        if self.is_test:
            return image_tensor
        return image_tensor, torch.tensor(self.target[item], dtype=torch.float)

In [None]:
!kaggle competitions download -c cmc-robust-real-vs-fake

In [None]:
# Training metadata table; its "id" and "label" columns are used below.
df = pd.read_csv(filepath_or_buffer="cmc-robust-real-vs-fake/train.csv")

In [None]:
# Hold out part of the labelled rows for validation (scikit-learn's default
# test size). A fixed random_state makes the split -- and every metric derived
# from it -- reproducible across kernel restarts; stratifying on "label" keeps
# the class ratio the same in both parts.
train, val = train_test_split(df, random_state=42, stratify=df["label"])
train.head()

In [None]:
# Labels and image locations for the training split; files are named after "id".
train_target = train["label"].values
train_paths = [
    f"cmc-robust-real-vs-fake/train/{image_id}.jpg"
    for image_id in train["id"].values
]

In [None]:
# Labels and image locations for the validation split (same folder as training).
valid_target = val["label"].values
valid_paths = [
    f"cmc-robust-real-vs-fake/train/{image_id}.jpg"
    for image_id in val["id"].values
]

In [None]:
# Validation has to be deterministic: keep only the resize step so the
# validation loss / ROC AUC (and the checkpoint chosen from them) do not
# fluctuate with random flips, shifts or brightness changes.
valid_transforms = A.Compose(
    [
        A.Resize(height=config["image_size"], width=config["image_size"], p=1),
    ],
    p=1,
)

train_dataset = RealFakeDataset(
    train_paths,
    train_target,
    is_test=False,
    augmentation=transforms,
)
valid_dataset = RealFakeDataset(
    valid_paths,
    valid_target,
    is_test=False,
    augmentation=valid_transforms,
)

# Training batches are reshuffled every epoch and an incomplete last batch is
# dropped; validation keeps file order and every sample.
train_loader = DataLoader(
    train_dataset,
    batch_size=config["batch_size"],
    shuffle=True,
    num_workers=config["num_workers"],
    drop_last=True,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=config["batch_size"],
    shuffle=False,
    num_workers=config["num_workers"],
    drop_last=False,
)

In [None]:
class RealFakeNN(nn.Module):
    """Convolutional binary classifier that emits one raw logit per image.

    Seven stages of ``Conv2d(kernel_size=5, stride=1, padding=2)`` followed by
    ``MaxPool2d(2)`` and ``ReLU`` are applied in sequence; the convolution keeps
    the spatial size and each pooling halves it. Stage 6 additionally ends with
    ``Dropout(0.25)`` and stage 7 applies ``BatchNorm2d`` before its ReLU. The
    result is flattened and fed to a lazily-sized linear layer with one output.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=16,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 64, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        # The original code assigned self.conv3 twice; the first (64 -> 64)
        # stage was immediately overwritten and never used, so only the
        # effective 64 -> 128 stage is kept.
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 128, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(128, 128, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(128, 64, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(64, 64, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Dropout(0.25),
        )
        self.conv7 = nn.Sequential(
            nn.Conv2d(64, 32, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )

        # in_features is inferred from the first batch that reaches this layer.
        self.out = nn.LazyLinear(1)

    def forward(self, x):
        """Map a ``(batch, 3, H, W)`` tensor to ``(batch, 1)`` logits."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        # Keep the batch axis, collapse channels and spatial axes.
        x = torch.flatten(x, 1)
        return self.out(x)
    
    
    
model = RealFakeNN()
model.to(config["device"])

# nn.LazyLinear only receives real weights on its first forward pass. Push one
# dummy batch through now -- in eval mode and without gradients, so Dropout is
# off and BatchNorm neither needs more than one sample nor updates its running
# statistics -- so that every parameter is materialised before an optimizer is
# built from model.parameters().
model.eval()
with torch.no_grad():
    model(
        torch.zeros(
            1, 3, config["image_size"], config["image_size"], device=config["device"]
        )
    )
# Back to training mode; returns the module, so the cell still shows its layout.
model.train()

In [None]:
# Loss applied to the raw logits returned by the model.
criterion = nn.functional.binary_cross_entropy_with_logits

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=config["lr"])

In [None]:
def train_fn(data_loader, model, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(inputs, targets)`` batches.
        model: Module called on the inputs; dimension 1 of its output is
            squeezed before the loss is computed.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            loss tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0

    for inputs, labels in tqdm(data_loader, total=len(data_loader)):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(inputs).squeeze(1)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(data_loader)

def eval_fn(data_loader, model, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating it.

    Args:
        data_loader: Sized iterable yielding ``(inputs, targets)`` batches.
        model: Module called on the inputs; dimension 1 of its output is
            squeezed before the loss is computed.
        criterion: Callable ``criterion(outputs, targets)`` expected to return
            the mean loss of the batch as a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, roc)``: the loss averaged over all samples and the ROC
        AUC of the raw model outputs against the targets.
    """
    model.eval()
    weighted_loss = 0.0
    n_samples = 0
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for X, targets in tqdm(data_loader, total=len(data_loader)):
            X = X.to(device)
            targets = targets.to(device)

            outputs = model(X)
            outputs = outputs.squeeze(1)

            loss = criterion(outputs, targets)
            # Weight each batch mean by its size: a plain average of batch
            # means over-weights a short final batch.
            batch_size = targets.size(0)
            weighted_loss += loss.item() * batch_size
            n_samples += batch_size

            fin_targets.extend(targets.tolist())
            fin_outputs.extend(outputs.tolist())

    roc = roc_auc_score(fin_targets, fin_outputs)
    return weighted_loss / n_samples, roc

def predict_fn(data_loader, model, device):
    """Collect the model's raw outputs for every batch of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding input batches (no targets).
        model: Module called on each batch; dimension 1 of its output is
            squeezed.
        device: Device the batch tensors are moved to.

    Returns:
        Flat Python list with the outputs in iteration order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in tqdm(data_loader, total=len(data_loader)):
            logits = torch.squeeze(model(inputs.to(device)), dim=1)
            predictions += logits.tolist()

    return predictions

In [None]:
# Per-epoch history, appended to by the training loop and plotted afterwards.
train_losses, val_losses, roc_auc_scores = [], [], []

In [None]:
# Drop unreferenced Python objects, then release PyTorch's cached CUDA memory
# before the training loop starts. (gc is imported in the top import cell.)
gc.collect()

torch.cuda.empty_cache()

In [None]:
# Train for the configured number of epochs. After each epoch the model is
# evaluated on the validation loader, the history lists are extended, and the
# model is checkpointed whenever the validation loss improves.
best_loss = float("inf")  # any finite validation loss counts as an improvement
for epoch in range(config['epochs']):
    train_loss = train_fn(train_loader, model, optimizer, criterion, config["device"])
    val_loss, metric = eval_fn(valid_loader, model, criterion, config["device"])

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    roc_auc_scores.append(metric)

    print(
        f"""
        epoch = {epoch},
        Train loss = {train_loss},
        Validation loss = {val_loss},
        ROC AUC = {metric}
        """
    )
    if val_loss < best_loss:
        best_loss = val_loss
        # The whole module object is pickled (not just its state_dict).
        torch.save(model, "my_model.pt")
        # Announce only after the file has actually been written.
        print("Model saved!")

In [None]:
# Each list holds one value per epoch, so the x axis is the epoch index; the
# y axis is shared by the two loss curves and the validation ROC AUC.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(val_losses, label="val")
ax.plot(train_losses, label="train")
ax.plot(roc_auc_scores, label="auroc")
ax.set_title("Training / Validation Loss and Validation ROC AUC")
ax.set_xlabel("epoch")
ax.set_ylabel("loss / ROC AUC")
ax.legend()
plt.show()

In [None]:
# Reload the best checkpoint written by the training loop. The file contains
# the whole pickled module (torch.save(model, ...)); recent PyTorch releases
# default to weights_only=True and refuse to unpickle that, so it is disabled
# explicitly. Only do this for checkpoints produced by this notebook --
# unpickling an untrusted file can execute arbitrary code.
model = torch.load("my_model.pt", map_location=config["device"], weights_only=False)
model.to(config["device"])
model.eval()

submission = pd.read_csv("cmc-robust-real-vs-fake/submission.csv")
test_paths = [f"cmc-robust-real-vs-fake/test/{i}.jpg" for i in submission["id"].values]

# Inference has to be deterministic: apply only the resize, none of the random
# training augmentations, so re-running the cell yields the same predictions.
test_transforms = A.Compose(
    [
        A.Resize(height=config["image_size"], width=config["image_size"], p=1),
    ],
    p=1,
)

test_dataset = RealFakeDataset(
    test_paths,
    is_test=True,
    augmentation=test_transforms,
)

# Keep file order and every sample so predictions line up with `submission`.
test_loader = DataLoader(
    test_dataset,
    batch_size=config["batch_size"],
    shuffle=False,
    num_workers=config["num_workers"],
    drop_last=False,
)

In [None]:
# Raw model outputs for the test images, in the order of `test_paths`.
result = predict_fn(
    data_loader=test_loader,
    model=model,
    device=config["device"],
)

In [None]:
# Attach the predictions and write the submission file without the index column.
submission["label"] = result
submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Preview the first five rows of the submission.
submission.head(n=5)