In [1]:
import os
import numpy as np
import librosa
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
SAMPLE_RATE = 16000
N_MFCC = 40
DATA_DIR = "/content/drive/MyDrive/AudioDataset"
SEGMENT_DURATION = 2.0  # seconds
AUGMENT_TIMES = 2


In [12]:
def split_audio(audio, sr=SAMPLE_RATE, segment_duration=SEGMENT_DURATION):
    """Split audio into fixed-length segments (in seconds)."""
    segment_length = int(segment_duration * sr)
    segments = []
    for start in range(0, len(audio), segment_length):
        segment = audio[start:start+segment_length]
        if len(segment) < 128:  # skip extremely short segments
            continue
        segments.append(segment)
    return segments


In [11]:
# MFCC extraction (safe)
def extract_mfcc(audio, sr=SAMPLE_RATE, n_mfcc=N_MFCC):
    if audio is None or len(audio) == 0:
        return None
    audio = np.nan_to_num(audio)  # replace NaNs/infs with 0
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val
    if len(audio) < 16:
        return None
    n_fft = 512 if len(audio) < 2048 else 2048
    try:
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft)
        mfcc_mean = np.mean(mfcc, axis=1)
        return mfcc_mean
    except:
        return None

# Augmentation functions
def add_noise_audio(audio, noise_level=0.005):
    noise = np.random.randn(len(audio))
    return audio + noise_level * noise

def change_speed_audio(audio, speed_factor=1.0):
    if speed_factor == 1.0:
        return audio
    try:
        return librosa.effects.time_stretch(audio, speed_factor)
    except:
        return audio

def pitch_shift_audio(audio, sr=SAMPLE_RATE, n_steps=0):
    try:
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)
    except:
        return audio

def augment_audio(audio):
    aug_audio = audio.copy()
    choice = random.choice([0,1,2])
    if choice == 0:
        aug_audio = add_noise_audio(aug_audio)
    elif choice == 1:
        speed_factor = random.uniform(0.9, 1.1)
        aug_audio = change_speed_audio(aug_audio, speed_factor)
    elif choice == 2:
        n_steps = random.randint(-2, 2)
        aug_audio = pitch_shift_audio(aug_audio, n_steps=n_steps)
    return aug_audio


In [13]:
X = []
y = []
class_map = {"Non_Violence": 0, "Violence": 1}

for label in ["Non_Violence", "Violence"]:
    folder = os.path.join(DATA_DIR, label)
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Folder not found: {folder}")

    for file in os.listdir(folder):
        if file.endswith((".wav", ".m4a")):
            path = os.path.join(folder, file)
            try:
                audio, _ = librosa.load(path, sr=SAMPLE_RATE, mono=True)
            except:
                print(f"Failed to load {file}")
                continue
            segments = split_audio(audio)
            for seg in segments:
                # original segment
                mfcc = extract_mfcc(seg)
                if mfcc is not None:
                    X.append(mfcc)
                    y.append(class_map[label])
                # augmented segments
                for _ in range(AUGMENT_TIMES):
                    aug_seg = augment_audio(seg)
                    mfcc_aug = extract_mfcc(aug_seg)
                    if mfcc_aug is not None:
                        X.append(mfcc_aug)
                        y.append(class_map[label])

X = np.array(X)
y = np.array(y)
print("✅ Dataset loaded (segment-level MFCC)")
print("X shape:", X.shape)
print("y distribution:", np.bincount(y))




✅ Dataset loaded (segment-level MFCC)
X shape: (18939, 40)
y distribution: [13500  5439]


In [14]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [15]:
clf = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced')
clf.fit(X_train_scaled, y_train)

# Save model and scaler
pickle.dump(clf, open("/content/drive/MyDrive/logreg_mfcc_seg_model.pkl", "wb"))
pickle.dump(scaler, open("/content/drive/MyDrive/scaler_mfcc_seg.pkl", "wb"))


In [16]:
def predict_file(file_path, threshold=0.5):
    try:
        audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
    except:
        return "Non_Violence", [0.0, 1.0]
    segments = split_audio(audio)
    segment_probs = []
    for seg in segments:
        features = extract_mfcc(seg)
        if features is None:
            continue
        features_scaled = scaler.transform([features])
        prob = clf.predict_proba(features_scaled)[0][1]  # Violence probability
        segment_probs.append(prob)
    if not segment_probs:
        return "Non_Violence", [0.0, 1.0]
    avg_prob = np.mean(segment_probs)
    label = "Violence" if avg_prob > threshold else "Non_Violence"
    return label, [1-avg_prob, avg_prob]


In [17]:
folder_to_predict = "/content/drive/MyDrive/test_audio"
for file in os.listdir(folder_to_predict):
    if file.endswith((".wav", ".m4a")):
        path = os.path.join(folder_to_predict, file)
        label, prob = predict_file(path)
        print(f"{file} -> {label}, probabilities: {prob}")


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


calm.m4a -> Violence, probabilities: [np.float64(0.2897911687128555), np.float64(0.7102088312871445)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Calm1.m4a -> Non_Violence, probabilities: [np.float64(0.7934520001427101), np.float64(0.2065479998572899)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Calm6.m4a -> Violence, probabilities: [np.float64(0.06751075785964578), np.float64(0.9324892421403542)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Calm 2.m4a -> Non_Violence, probabilities: [np.float64(0.865199341740329), np.float64(0.13480065825967108)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Calm4.m4a -> Non_Violence, probabilities: [np.float64(0.9134990530003275), np.float64(0.08650094699967244)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Calm5.m4a -> Non_Violence, probabilities: [np.float64(0.6541923658713182), np.float64(0.3458076341286817)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Calm 3.m4a -> Non_Violence, probabilities: [np.float64(0.6059026903578686), np.float64(0.39409730964213135)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Viol5.m4a -> Non_Violence, probabilities: [np.float64(0.8020390312491058), np.float64(0.19796096875089422)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Viol1.m4a -> Non_Violence, probabilities: [np.float64(0.8242182404380788), np.float64(0.17578175956192124)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Viol3.m4a -> Violence, probabilities: [np.float64(0.0005849277794147678), np.float64(0.9994150722205852)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Viol4.m4a -> Violence, probabilities: [np.float64(0.0022102657752319788), np.float64(0.997789734224768)]
Viol2.m4a -> Violence, probabilities: [np.float64(0.2871361054917593), np.float64(0.7128638945082407)]


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [18]:
X_test_scaled = scaler.transform(X_test)  # already done in previous block


In [19]:
y_pred = clf.predict(X_test_scaled)
y_prob = clf.predict_proba(X_test_scaled)  # probabilities for each class


In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("✅ Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Non_Violence","Violence"]))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Test Accuracy: 0.961281239000352

Classification Report:
               precision    recall  f1-score   support

Non_Violence       0.98      0.97      0.97      2025
    Violence       0.93      0.94      0.93       816

    accuracy                           0.96      2841
   macro avg       0.95      0.96      0.95      2841
weighted avg       0.96      0.96      0.96      2841


Confusion Matrix:
 [[1963   62]
 [  48  768]]


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# If clf is already trained and scaler exists
pipeline = Pipeline([
    ("scaler", scaler),   # scales the MFCC features
    ("classifier", clf)   # logistic regression model
])


In [22]:
import pickle

with open("/content/drive/MyDrive/audio_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("✅ Pipeline saved")


✅ Pipeline saved
