In [108]:
# The goal is to detect fraudulent internships based on whether they meet specific conditions 
# (e.g., stipend presence, valid company name, reasonable duration). 
# If an internship fails to meet these, it's classified as fraud.

# Data Collection & Pre-processing

In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [110]:
data = pd.read_csv('C:/Users/KIIT0001/Desktop/MAHASWETA/JUPYTER NOTEBOOK/Internshala.csv')
data.head(3)

Unnamed: 0,Title,Company Name,Internship Type,Duration,Stipend,Farud or not
0,Web Development,Computer Science Internship in Delhi,Bhopal,1 Month,"2,000 - 10,000 /month",0
1,PHP Development,Internships in Delhi,"Thane, Navi Mumbai",6 Months,"12,000 - 18,000 /month",0
2,iOS App Development,Computer Science Jobs in Delhi,Ahmedabad,3 Months,"5,000 - 12,000 /month",0


In [111]:
data.shape

(40, 6)

In [112]:
data.describe()

Unnamed: 0,Farud or not
count,40.0
mean,0.1
std,0.303822
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [113]:
# checking for missing values
data.isnull().sum()

Title              0
Company Name       0
Internship Type    0
Duration           0
 Stipend           0
Farud or not       0
dtype: int64

In [114]:
# Handling missing values
data.fillna("Unknown", inplace=True)

# Feature Engineering

In [115]:
# checking for correct column names
data.rename(columns=lambda x: x.strip(), inplace=True)  # Removes extra spaces
print(data.columns)

Index(['Title', 'Company Name', 'Internship Type', 'Duration', 'Stipend',
       'Farud or not'],
      dtype='object')


In [124]:
import re

def convert_stipend(Stipend):
    numbers = re.findall(r'\d+', Stipend.replace(",", ""))  # Extract numbers
    return int(numbers[0]) if numbers else None  # Use first number

# Apply conversion
data["Converted Stipend"] = data["Stipend"].apply(convert_stipend)

data["Converted Stipend"] = data["Converted Stipend"].replace("Unpaid", 0)

In [129]:
print(data.columns)

Index(['Title', 'Company Name', 'Internship Type', 'Duration', 'Farud or not',
       'Converted Stipend'],
      dtype='object')


In [117]:
# distribution of legit transactions & fraud internships
# 0 -> legit
# 1 -> fraud

data['Farud or not'].value_counts()

Farud or not
0    36
1     4
Name: count, dtype: int64

In [136]:
 # Convert text categories into numbers
le = LabelEncoder()
for col in ["Title","Company Name", "Internship Type", "Duration","Converted Stipend"]:
    data[col] = le.fit_transform(data[col])
    
data = data.astype(float)  # Convert entire dataset to float 

In [143]:
data.head(6)

Unnamed: 0,Title,Company Name,Internship Type,Duration,Farud or not,Converted Stipend
0,26.0,3.0,7.0,0.0,0.0,1.0
1,15.0,9.0,5.0,4.0,0.0,11.0
2,31.0,4.0,0.0,2.0,0.0,5.0
3,28.0,10.0,6.0,2.0,1.0,9.0
4,9.0,2.0,6.0,2.0,1.0,9.0
5,30.0,15.0,6.0,2.0,1.0,9.0


# Splitting feature-target

In [137]:
X = data.drop(columns='Farud or not', axis=1)
Y = data['Farud or not']

print(X)
print(Y)

    Title  Company Name  Internship Type  Duration  Converted Stipend
0    26.0           3.0              7.0       0.0                1.0
1    15.0           9.0              5.0       4.0               11.0
2    31.0           4.0              0.0       2.0                5.0
3    28.0          10.0              6.0       2.0                9.0
4     9.0           2.0              6.0       2.0                9.0
5    30.0          15.0              6.0       2.0                9.0
6     5.0          15.0              6.0       0.0                9.0
7    24.0          15.0              6.0       2.0               12.0
8    12.0          15.0              6.0       3.0               10.0
9     2.0          15.0             11.0       4.0               14.0
10    7.0          11.0              6.0       2.0                8.0
11   10.0          19.0              6.0       2.0               13.0
12   18.0          16.0              6.0       0.0                0.0
13    0.0          1

# Splitting train-test data

In [138]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(40, 5) (32, 5) (8, 5)


# Training model

In [139]:
model = LogisticRegression()
model.fit(X_train, Y_train)

# Model evaluation - checking accuracy score

In [140]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.90625


In [141]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.875


# Predictive System

In [148]:
input_data = (9.0,2.0,6.0,2.0,9.0)

# changing the input data to numpy array
numpy_array_of_input_data = np.asarray(input_data)

# reshape the numpy array as we are predicting for one instance
input_data_reshaped = numpy_array_of_input_data.reshape(1,-1)

# make a prediction
prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == 1):
    print("Legitimate")
else:
    print("Fraud")

[0.]
Fraud


