In [None]:
# Question No. 1
import pandas as pd

# Step 1: Import the data
# Raw string literal: the Windows path contains backslashes (\M, \A) that a normal
# string literal treats as invalid escape sequences and warns about. The resulting
# path value is unchanged.
# NOTE(review): absolute, machine-specific path - consider a path relative to the notebook.
df = pd.read_csv(r"E:\MNS-UET BS IT\AICP ML INTERNSHIP\ML Internship Task 1/transaction_anomalies_dataset.csv")

# Step 2: Check for null values (count of missing entries per column)
null_values = df.isnull().sum()
print("Null values:\n", null_values)

# Step 3: Check column information
print("\nColumn information:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Step 4: Generate descriptive statistics
print("\nDescriptive statistics:")
print(df.describe())


In [None]:
# Question No. 2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Apply seaborn's white-grid look
sns.set(style="whitegrid")

# Histogram of the transaction amounts (30 bins) with a KDE curve on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='Transaction_Amount',
    bins=30,
    kde=True,
    color='skyblue',
    edgecolor='black',
    ax=ax,
)
ax.set_title('Distribution of Transaction Amounts')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Question No. 3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's white-grid look
sns.set(style="whitegrid")

# One violin per account type, showing how the transaction amounts are spread
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(
    data=df,
    x='Account_Type',
    y='Transaction_Amount',
    hue='Account_Type',  # colour each violin by its own category
    palette='Set2',
    legend=False,        # hue repeats the x axis, so no legend is requested
    ax=ax,
)
ax.set(
    title='Distribution of Transaction Amounts by Account Type (Violin Plot)',
    xlabel='Account Type',
    ylabel='Transaction Amount',
)
ax.grid(True)
plt.show()



In [None]:
# Question No. 4
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's white-grid look
sns.set(style="whitegrid")

# Point plot of the mean transaction amount for each age value, without error bars
fig, ax = plt.subplots(figsize=(10, 6))
sns.pointplot(
    data=df,
    x='Age',
    y='Transaction_Amount',
    estimator=pd.Series.mean,
    errorbar=None,
    color='skyblue',
    ax=ax,
)
ax.set(
    title='Average Transaction Amount by Age',
    xlabel='Age',
    ylabel='Average Transaction Amount',
)
# Tilt the age tick labels
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True)
plt.show()



In [None]:
# Question No. 5
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set the style of seaborn
sns.set(style="whitegrid")

# Plot the count of transactions by day of the week
plt.figure(figsize=(10, 6))
# hue repeats the x variable only to give each day its own bar colour. The days are
# already named on the x axis, so the legend is switched off explicitly (same
# approach as the violin plot above) instead of calling plt.legend() afterwards,
# which finds no labelled artists when seaborn has already suppressed the
# redundant legend and then warns and draws an empty legend box.
sns.countplot(data=df, x='Day_of_Week', hue='Day_of_Week', legend=False)
plt.title('Count of Transactions by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Transaction Count')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Question No. 6
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the int64/float64 columns so that corr() sees numeric data only
numeric_df = df.select_dtypes(include=['int64', 'float64'])

# Pairwise correlation between the remaining columns
correlation_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix (values shown with two decimals)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Question No. 7
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Anomaly rule (for demonstration purposes): a transaction is flagged when its
# amount is more than two standard deviations above the mean amount.
amount = df['Transaction_Amount']
threshold_amount = amount.mean() + 2 * amount.std()

# Split the rows with the rule so each plotted layer contains only what its label says
is_anomaly = amount > threshold_amount
anomalies = df[is_anomaly]
normal_transactions = df[~is_anomaly]

# Visualize anomalies using Seaborn
plt.figure(figsize=(10, 6))
# Only the non-flagged rows go into the "Normal Transactions" layer; plotting all of
# df here would draw the anomalous rows under the "normal" label as well.
sns.scatterplot(data=normal_transactions, x='Transaction_Amount', y='Frequency_of_Transactions',
                label='Normal Transactions')
sns.scatterplot(data=anomalies, x='Transaction_Amount', y='Frequency_of_Transactions',
                color='red', label='Anomalies')
plt.title('Anomalies in Transaction Data')
plt.xlabel('Transaction Amount')
plt.ylabel('Frequency of Transactions')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Question No. 8
import pandas as pd

# Demonstration rule: a transaction counts as an anomaly when its amount is more
# than two standard deviations above the mean transaction amount.
amounts = df['Transaction_Amount']
threshold_amount = amounts.mean() + 2 * amounts.std()

# Rows above the threshold
num_anomalies = len(df[amounts > threshold_amount])

# All rows in the data
total_data_points = len(df)

# Share of rows flagged by the rule
anomaly_ratio = num_anomalies / total_data_points

for caption, value in (
    ("Number of anomalies:", num_anomalies),
    ("Total data points:", total_data_points),
    ("Ratio of anomalies:", anomaly_ratio),
):
    print(caption, value)



In [None]:
# Question No. 9
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder

# Select relevant features
selected_features = ['Transaction_Amount', 'Transaction_Volume', 'Frequency_of_Transactions',
                     'Time_Since_Last_Transaction', 'Age', 'Income', 'Account_Type']

# Preprocess data
# Drop rows with a missing value in any model feature. df is rebound to an explicit
# copy (instead of dropna(inplace=True)) so the column assignments below act on a
# frame this cell owns.
df = df.dropna(subset=selected_features).copy()

# Encode categorical variable 'Account_Type' as integer codes.
# NOTE: this overwrites the 'Account_Type' column on df with the codes.
label_encoder = LabelEncoder()
df['Account_Type'] = label_encoder.fit_transform(df['Account_Type'])

# Fit Isolation Forest model
# contamination=0.05 -> 5% contamination (anomaly ratio).
# random_state is fixed because the forest is built with random sampling; without a
# seed the set of flagged rows can change from one run to the next.
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
isolation_forest.fit(df[selected_features])

# Get anomaly scores (one per row)
anomaly_scores = isolation_forest.decision_function(df[selected_features])

# Convert anomaly scores to binary values (0: normal, 1: anomaly): a negative
# score is treated as an anomaly. Vectorized, then turned back into a plain list.
binary_predictions = (anomaly_scores < 0).astype(int).tolist()

# Add binary predictions to the DataFrame
df['Anomaly'] = binary_predictions

# Display DataFrame with anomaly predictions
print(df.head())


In [None]:
# Question No. 10
from sklearn.metrics import classification_report

# Evaluate the Isolation Forest flags stored in df['Anomaly'] rather than two
# identical hard-coded example lists, which always produced a perfect report and
# replaced the real predictions in `binary_predictions`.
# NOTE(review): no labelled anomaly column is visible in this notebook, so the
# mean + 2*std amount rule (`threshold_amount`, computed in the earlier cells) is
# used as a proxy ground truth - swap in real labels if the dataset provides them.
ground_truth_labels = (df['Transaction_Amount'] > threshold_amount).astype(int).tolist()  # 0: normal, 1: anomaly
binary_predictions = df['Anomaly'].tolist()  # 0: normal, 1: anomaly (model output)

# Calculate precision, recall, and F1-score of the model flags against the proxy labels
report = classification_report(ground_truth_labels, binary_predictions)

print(report)


In [None]:
# Question No. 11
import pandas as pd
import joblib

from pathlib import Path

from sklearn.ensemble import IsolationForest

MODEL_PATH = "isolation_forest_model.pkl"
# Features collected from the user below; the model must be fitted on exactly these columns.
MODEL_FEATURES = ["Transaction_Amount", "Average_Transaction_Amount", "Frequency_of_Transactions"]

# Load the trained Isolation Forest model.
# Nothing visible in this notebook writes the pickle, so when it is missing a model
# is fitted once on the three prompt features and cached for later runs.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
if Path(MODEL_PATH).is_file():
    isolation_forest = joblib.load(MODEL_PATH)
else:
    # NOTE(review): assumes df has an 'Average_Transaction_Amount' column - confirm against the CSV.
    isolation_forest = IsolationForest(contamination=0.05, random_state=42)
    isolation_forest.fit(df[MODEL_FEATURES].dropna())
    joblib.dump(isolation_forest, MODEL_PATH)

# Obtain transaction details from the user (float() raises ValueError on non-numeric input)
transaction_amount = float(input("Enter the value for 'transaction_amount': "))
average_transaction_amount = float(input("Enter the value for 'Average_transaction_amount': "))
frequency_of_transactions = float(input("Enter the value for 'Frequency_of_Transactions': "))

# Create a dictionary with transaction details
transaction_details = {
    "Transaction_Amount": transaction_amount,
    "Average_Transaction_Amount": average_transaction_amount,
    "Frequency_of_Transactions": frequency_of_transactions
}

# Convert the dictionary to a one-row DataFrame with the columns in model order
transaction_df = pd.DataFrame([transaction_details], columns=MODEL_FEATURES)

# Obtain the anomaly score for the transaction (array with one entry, one per row)
anomaly_score = isolation_forest.decision_function(transaction_df)

# Set a threshold to classify anomalies
threshold = 0  # Adjust threshold as needed

# Compare the single row's score as a scalar instead of relying on the truth value
# of a one-element array.
if anomaly_score[0] < threshold:
    print("Anomaly detected: This transaction is flagged as an anomaly")
else:
    print("No anomaly detected: This transaction is not flagged as an anomaly")
