In [18]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('CreditCardFraudEDA').getOrCreate()

fraudTrain = spark.read.csv('fraudTrain.csv', header=True, inferSchema=True)

fraudTrain.printSchema()
fraudTrain.show(5)

root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)

+---+---------------------+----------------+--------------------+-------------+------+---------+-------+--

In [6]:
fraudTrain.describe().show()

+-------+-----------------+--------------------+-------------------+-------------+-----------------+-------+-------+-------+--------------------+-------+-------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+------------------+------------------+--------------------+
|summary|              _c0|              cc_num|           merchant|     category|              amt|  first|   last| gender|              street|   city|  state|              zip|              lat|              long|         city_pop|               job|           trans_num|           unix_time|         merch_lat|        merch_long|            is_fraud|
+-------+-----------------+--------------------+-------------------+-------------+-----------------+-------+-------+-------+--------------------+-------+-------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+----------------

In [20]:
from pyspark.sql.types import NumericType

schema = fraudTrain.schema.fields

column_types = {
    "numeric": [],
    "others": []
}

for field in schema:
    if isinstance(field.dataType, NumericType):
        column_types["numeric"].append(field.name)
    else:
        column_types["others"].append(field.name)

print("Numeric : ",column_types["numeric"])
print("Others : ",column_types["others"])


Numeric :  ['_c0', 'cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']
Others :  ['trans_date_trans_time', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num']


In [7]:
from pyspark.sql.functions import col, count, when, round, mean, stddev, min, max, corr
fraudTrain.select([count(when(col(c).isNull(), c)).alias(c) for c in fraudTrain.columns]).show()

+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|_c0|trans_date_trans_time|cc_num|merchant|category|amt|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|trans_num|unix_time|merch_lat|merch_long|is_fraud|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+
|  0|                    0|     0|       0|       0|  0|    0|   0|     0|     0|   0|    0|  0|  0|   0|       0|  0|  0|        0|        0|        0|         0|       0|
+---+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+



In [8]:
from pyspark.sql.functions import col, unix_timestamp
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Drop unnecessary columns
drop_cols = ['_c0', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state',
             'zip', 'job', 'dob', 'trans_num', 'merchant']
data = fraudTrain.drop(*drop_cols)

# Convert timestamp to numeric
data = data.withColumn("trans_date_ts", unix_timestamp("trans_date_trans_time"))
data = data.drop("trans_date_trans_time")

# Encode categorical columns
cat_col = "category"
indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_index", handleInvalid='keep')

# Define feature columns
feature_cols = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'trans_date_ts', 'category_index']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Final dataset
final_data = indexer.fit(data).transform(data)
final_data = assembler.transform(final_data).select("features", "is_fraud")


In [9]:
final_data.printSchema()
final_data.show(5)

root
 |-- features: vector (nullable = true)
 |-- is_fraud: integer (nullable = true)

+--------------------+--------+
|            features|is_fraud|
+--------------------+--------+
|[4.97,36.0788,-81...|       0|
|[107.23,48.8878,-...|       0|
|[220.11,42.1808,-...|       0|
|[45.0,46.2306,-11...|       0|
|[41.96,38.4207,-7...|       0|
+--------------------+--------+
only showing top 5 rows



In [10]:
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="is_fraud", featuresCol="features")
lr_model = lr.fit(train_data)
lr_preds = lr_model.transform(test_data)


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\hp\AppData\Local\Programs\Python\Python310\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "c:\Users\hp\AppData\Local\Programs\Python\Python310\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "c:\Users\hp\AppData\Local\Programs\Python\Python310\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [12]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="is_fraud", featuresCol="features", numTrees=50)
rf_model = rf.fit(train_data)
rf_preds = rf_model.transform(test_data)


In [13]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol="is_fraud", featuresCol="features", maxIter=20)
gbt_model = gbt.fit(train_data)
gbt_preds = gbt_model.transform(test_data)


In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="is_fraud")

print("LogReg AUC:", evaluator.evaluate(lr_preds))
print("Random Forest AUC:", evaluator.evaluate(rf_preds))
print("GBT AUC:", evaluator.evaluate(gbt_preds))


LogReg AUC: 0.8636096114250789
Random Forest AUC: 0.9364372116793445
GBT AUC: 0.9867289607308676


In [None]:
# user_input = {
#     'amt': 100.0,
#     'lat': 37.7749,
#     'long': -122.4194,
#     'city_pop': 50000,
#     'unix_time': 1325376018,
#     'merch_lat': 37.0,
#     'merch_long': -122.0,
#     'trans_date_ts': 1577836800,
#     'category': 'misc_pos'
# }


In [22]:
from pyspark.sql import Row

# Example user input
user_input = {
    'amt': 100.0,
    'lat': 37.7749,
    'long': -122.4194,
    'city_pop': 50000,
    'unix_time': 1325376018,
    'merch_lat': 37.0,
    'merch_long': -122.0,
    'trans_date_ts': 1577836800,
    'category': 'misc_pos'
}

# Create DataFrame from user input
input_df = spark.createDataFrame([Row(**user_input)])

# Apply the saved indexer model
input_df = indexer_model.transform(input_df)

# Assemble features
input_df = assembler.transform(input_df)

# Predict using trained model (e.g., Random Forest)
prediction = rf_model.transform(input_df).select("prediction", "probability")
prediction.show()


NameError: name 'indexer_model' is not defined

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import NumericType
from pyspark.sql.functions import col, count, when, unix_timestamp
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import Row

# 1. Spark session and data loading
spark = SparkSession.builder.appName('CreditCardFraudEDA').getOrCreate()
fraudTrain = spark.read.csv('fraudTrain.csv', header=True, inferSchema=True)

# 2. Print schema and basic info
fraudTrain.printSchema()
fraudTrain.show(5)
fraudTrain.describe().show()

# 3. Identify numeric columns
schema = fraudTrain.schema.fields
column_types = {"numeric": [], "others": []}
for field in schema:
    if isinstance(field.dataType, NumericType):
        column_types["numeric"].append(field.name)
    else:
        column_types["others"].append(field.name)

print("Numeric : ", column_types["numeric"])
print("Others : ", column_types["others"])

# 4. Check for nulls
fraudTrain.select([count(when(col(c).isNull(), c)).alias(c) for c in fraudTrain.columns]).show()

# 5. Drop unneeded columns
drop_cols = ['_c0', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state',
             'zip', 'job', 'dob', 'trans_num', 'merchant']
data = fraudTrain.drop(*drop_cols)

# 6. Convert timestamp to numeric
data = data.withColumn("trans_date_ts", unix_timestamp("trans_date_trans_time")).drop("trans_date_trans_time")

# 7. Encode category
cat_col = "category"
indexer = StringIndexer(inputCol=cat_col, outputCol="category_index", handleInvalid='keep')
indexer_model = indexer.fit(data)
data_indexed = indexer_model.transform(data)

# 8. Assemble features
feature_cols = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'trans_date_ts', 'category_index']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
final_data = assembler.transform(data_indexed).select("features", "is_fraud")

# 9. Train-test split
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

# 10. Train models
lr = LogisticRegression(labelCol="is_fraud", featuresCol="features")
lr_model = lr.fit(train_data)
lr_preds = lr_model.transform(test_data)

rf = RandomForestClassifier(labelCol="is_fraud", featuresCol="features", numTrees=50)
rf_model = rf.fit(train_data)
rf_preds = rf_model.transform(test_data)

gbt = GBTClassifier(labelCol="is_fraud", featuresCol="features", maxIter=20)
gbt_model = gbt.fit(train_data)
gbt_preds = gbt_model.transform(test_data)

# 11. Evaluation
evaluator = BinaryClassificationEvaluator(labelCol="is_fraud")
print("LogReg AUC:", evaluator.evaluate(lr_preds))
print("Random Forest AUC:", evaluator.evaluate(rf_preds))
print("GBT AUC:", evaluator.evaluate(gbt_preds))

# 12. User input prediction
user_input = {
    'amt': 100.0,
    'lat': 37.7749,
    'long': -122.4194,
    'city_pop': 50000,
    'unix_time': 1325376018,
    'merch_lat': 37.0,
    'merch_long': -122.0,
    'trans_date_ts': 1577836800,
    'category': 'misc_pos'
}

input_df = spark.createDataFrame([Row(**user_input)])
input_df = indexer_model.transform(input_df)
input_df = assembler.transform(input_df)
prediction = rf_model.transform(input_df).select("prediction", "probability")
prediction.show()


root
 |-- _c0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)

+---+---------------------+----------------+--------------------+-------------+------+---------+-------+--