In [None]:
#Example1: check whether the data follows a normal distribution while considering that most clicks occur during the day and fewer at night

#It is assumed that each publisher's click timestamps are recorded in that publisher's local time zone.
#You can perform a normality test and visually inspect the data using a histogram.
#It's important to keep in mind that click data might not follow a strict normal distribution due to diurnal patterns.

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import your_database_library  # Import database library (e.g., psycopg2, pymysql)

# Define SQL query: number of clicks per publisher for each hour of the day
query = """
SELECT
    publisher_id,
    EXTRACT(HOUR FROM click_timestamp) AS hour_of_day,
    COUNT(*) AS clicks_per_hour
FROM
    your_clicks_table
GROUP BY
    publisher_id,
    hour_of_day
ORDER BY
    publisher_id,
    hour_of_day;
"""

# Database connection parameters (placeholders; replace with real values)
db_params = {
    'host': 'your_host',
    'user': 'your_user',
    'password': 'your_password',
    'database': 'your_database'
}

# Bind the name before the try block so the finally clause can test it safely
# even when connect() itself raises (otherwise a NameError masks the real error).
connection = None
try:
    # Establish a database connection
    connection = your_database_library.connect(**db_params)

    # Execute the SQL query to get the data
    with connection.cursor() as cursor:
        cursor.execute(query)
        result = cursor.fetchall()

    # Extract the 'clicks_per_hour' column (third field of each row) into a NumPy array
    clicks_data = np.array([row[2] for row in result])

    if clicks_data.size < 3:
        # The Shapiro-Wilk test needs at least 3 observations, and min()/max()
        # below would fail on an empty result set.
        print("Not enough data points to test for normality (need at least 3).")
    else:
        # Perform the Shapiro-Wilk normality test
        statistic, p_value = stats.shapiro(clicks_data)

        # Create a histogram to visualize the distribution.
        # density=True normalises the bars to a probability density, so the
        # y-axis is a density (comparable to the PDF overlay), not a raw count.
        plt.hist(clicks_data, bins=20, density=True, alpha=0.6, color='b', label='Clicks per Hour')
        plt.xlabel('Number of Clicks')
        plt.ylabel('Density')
        plt.title('Clicks Distribution')

        # Overlay a normal distribution curve with the sample mean/std for comparison
        mean, std_dev = np.mean(clicks_data), np.std(clicks_data)
        x = np.linspace(clicks_data.min(), clicks_data.max(), 100)
        pdf = stats.norm.pdf(x, mean, std_dev)
        plt.plot(x, pdf, 'k--', label='Normal Distribution')

        plt.legend()
        plt.show()

        # Check if the data follows a normal distribution.
        # NOTE: p_value > alpha only means normality cannot be rejected at this
        # significance level; it is not proof that the data is normal.
        alpha = 0.05  # Set your desired significance level
        if p_value > alpha:
            print("The data follows a normal distribution.")
        else:
            print("The data does not follow a normal distribution")

finally:
    # Always release the connection if one was actually opened
    if connection is not None:
        connection.close()


In [None]:
#Example2: check whether multiple clicks that occurred on the previous day came from the same IP address on the same publisher.


from collections import defaultdict
from datetime import datetime, timedelta

# Calculate the date for yesterday first, so the sample data below can be
# stamped with it. With fixed historical timestamps the "yesterday" filter
# would never match and the example would print nothing.
yesterday = datetime.now() - timedelta(days=1)
yesterday_str = yesterday.strftime("%Y-%m-%d")

# Sample data with timestamp, IP addresses, and publisher_id
sample_data = [
    {"timestamp": f"{yesterday_str} 14:30:00", "ip": "192.168.1.1", "publisher_id": 1},
    {"timestamp": f"{yesterday_str} 14:45:00", "ip": "192.168.1.2", "publisher_id": 1},
    {"timestamp": f"{yesterday_str} 15:00:00", "ip": "192.168.1.1", "publisher_id": 2},
    {"timestamp": f"{yesterday_str} 15:15:00", "ip": "192.168.1.1", "publisher_id": 1},
    {"timestamp": f"{yesterday_str} 15:30:00", "ip": "192.168.1.2", "publisher_id": 1},
    # Add more data entries here
]

# Filter the sample data for clicks from yesterday (timestamps start with "YYYY-MM-DD")
yesterday_data = [entry for entry in sample_data if entry["timestamp"].startswith(yesterday_str)]

# Map each IP address to the list of publisher_ids it clicked on yesterday
# (one list item per click, so repeated publishers appear repeatedly)
ip_publisher_counts = defaultdict(list)

# Process the data for yesterday's clicks
for entry in yesterday_data:
    ip_publisher_counts[entry["ip"]].append(entry["publisher_id"])

# Find IP addresses with multiple clicks on the SAME publisher_id for yesterday.
# Each hit is recorded as (ip, publisher_id, click_count) so the result can be
# inspected programmatically as well as printed.
repeated_clicks = []
for ip, publisher_ids in ip_publisher_counts.items():
    # Count this IP's clicks per publisher
    clicks_by_publisher = defaultdict(int)
    for publisher_id in publisher_ids:
        clicks_by_publisher[publisher_id] += 1

    for publisher_id, click_count in clicks_by_publisher.items():
        if click_count > 1:
            repeated_clicks.append((ip, publisher_id, click_count))
            print(f"IP address {ip} clicked {click_count} times on publisher_id {publisher_id} yesterday")


In [None]:
#Example3: Get data from your vendor's API and calculate the fraud rate.

import requests
import pandas as pd

# Define the API endpoint URL and any required headers.
# NOTE: the bearer token below is a placeholder; load the real key from an
# environment variable or secrets manager rather than hard-coding it.
api_url = "https://api.vendor.com/data"
headers = {
    "Authorization": "Bearer Your_API_Key"
}

try:
    # Make an HTTP GET request to the API. requests has no default timeout,
    # so without one this call can block forever on an unresponsive server.
    response = requests.get(api_url, headers=headers, timeout=30)

    if response.status_code == 200:
        # Convert the API response JSON data into a DataFrame
        data = response.json()
        df = pd.DataFrame(data)

        # Fail with a clear message instead of a bare KeyError when the
        # payload lacks the columns this calculation needs.
        missing_columns = {'publisher_id', 'page_views', 'fraud'} - set(df.columns)
        if missing_columns:
            print(f"Response is missing expected columns: {sorted(missing_columns)}")
        else:
            # Group once and reuse the grouping for both aggregations
            grouped = df.groupby('publisher_id')

            # Sum the page views per publisher_id
            page_views_per_publisher = grouped['page_views'].sum()

            # Calculate the fraud percentage per publisher_id. Publishers with
            # zero page views get NaN instead of a misleading inf.
            nonzero_page_views = page_views_per_publisher.where(page_views_per_publisher != 0)
            fraud_percentage = (grouped['fraud'].sum() / nonzero_page_views) * 100

            # Print the results
            print("Page Views per Publisher:")
            print(page_views_per_publisher)

            print("\nFraud Percentage per Publisher:")
            print(fraud_percentage)

    else:
        print(f"Request failed with status code: {response.status_code}")

except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts and other request-level failures
    print(f"Request error: {e}")
