In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [6]:
# Create SparkConf and SparkContext
conf = SparkConf().setAppName("projectName").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)

In [7]:
# Initialize SparkSession
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [16]:
data = [
    {"Name": "Alice", "Age": 25, "City": "New York"},
    {"Name": "Bob", "Age": 30, "City": "San Francisco"},
    {"Name": "Charlie", "Age": 35, "City": "Los Angeles"},
    {"Name": "David", "Age": 40, "City": "Chicago"}
]

In [17]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("projectName").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)

In [18]:
rdd = sc.parallelize(data)
type(rdd)

In [19]:
df = rdd.toDF()
type(df)

In [21]:
# Write the DataFrame to a parquet file, overwriting if the file exists.
file_path = "my_data.parquet"
df.write.format("parquet").mode("overwrite").save(file_path)

In [22]:
# Read the parquet file back into a DataFrame.
df_loaded = spark.read.format("parquet").load(file_path)
df_loaded.show()

+---+-------------+-------+
|Age|         City|   Name|
+---+-------------+-------+
| 35|  Los Angeles|Charlie|
| 40|      Chicago|  David|
| 25|     New York|  Alice|
| 30|San Francisco|    Bob|
+---+-------------+-------+



In [24]:
import pandas as pd

# Define data
[
    {"Name": "Alice", "Age": 25, "City": "New York"},
    {"Name": "Bob", "Age": 30, "City": "San Francisco"},
    {"Name": "Charlie", "Age": 35, "City": "Los Angeles"},
    {"Name": "David", "Age": 40, "City": "Chicago"}
]

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV file
df.to_csv("data.csv", index=False)

print("CSV file created successfully!")

CSV file created successfully!


In [26]:
df = spark.read.csv('data.csv')

In [28]:
import pandas as pd

# Sample data for multiple files
data1 = {"Name": ["Alice", "Bob"], "Age": [25, 30], "Salary": [50000, 60000]}
data2 = {"Name": ["Charlie", "David"], "Age": [35, 40], "Salary": [70000, 80000]}
data3 = {"Name": ["Eve", "Frank"], "Age": [45, 50], "Salary": [90000, 100000]}

# Convert to DataFrames
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
df3 = pd.DataFrame(data3)

# Save DataFrames as CSV files
df1.to_csv("file1.csv", index=False)
df2.to_csv("file2.csv", index=False)
df3.to_csv("file3.csv", index=False)

print("CSV files created successfully!")

CSV files created successfully!


In [30]:
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder.appName("Read_Multiple_CSVs").getOrCreate()

# List of CSV file paths
csv_files = ["file1.csv", "file2.csv", "file3.csv"]

# Read all files into a single DataFrame
df = spark.read.csv(csv_files, header=True, inferSchema=True)

# Show DataFrame
df.show()


+-------+---+------+
|   Name|Age|Salary|
+-------+---+------+
|Charlie| 35| 70000|
|  David| 40| 80000|
|    Eve| 45| 90000|
|  Frank| 50|100000|
|  Alice| 25| 50000|
|    Bob| 30| 60000|
+-------+---+------+



In [33]:
# Read multiple CSV files while keeping headers
df = spark.read.option("header", "true").csv(["file1.csv", "file2.csv", "file3.csv"])
df.show()


+-------+---+------+
|   Name|Age|Salary|
+-------+---+------+
|Charlie| 35| 70000|
|  David| 40| 80000|
|    Eve| 45| 90000|
|  Frank| 50|100000|
|  Alice| 25| 50000|
|    Bob| 30| 60000|
+-------+---+------+



In [34]:
df = spark.read.options(header=True, inferSchema=True).csv(["file1.csv", "file2.csv", "file3.csv"])
df.show()

+-------+---+------+
|   Name|Age|Salary|
+-------+---+------+
|Charlie| 35| 70000|
|  David| 40| 80000|
|    Eve| 45| 90000|
|  Frank| 50|100000|
|  Alice| 25| 50000|
|    Bob| 30| 60000|
+-------+---+------+



In [39]:
df = spark.read.options(header=True).csv(["file1.csv", "file2.csv", "file3.csv"])
df.show()

+-------+---+------+
|   Name|Age|Salary|
+-------+---+------+
|Charlie| 35| 70000|
|  David| 40| 80000|
|    Eve| 45| 90000|
|  Frank| 50|100000|
|  Alice| 25| 50000|
|    Bob| 30| 60000|
+-------+---+------+



In [61]:
# Create a dummy text file for demonstration
file_path = "dataspark.txt"
with open(file_path, "w") as f:
    f.write("Alice 34 Bob 45 Charlie 30.")

# Read the text file into a DataFrame
df = spark.read.text(file_path)

# Show the DataFrame content
df.show(truncate=False)
df = spark.read.text(file_path)

# Show the DataFrame content
df.show(truncate=False)

+---------------------------+
|value                      |
+---------------------------+
|Alice 34 Bob 45 Charlie 30.|
+---------------------------+

+---------------------------+
|value                      |
+---------------------------+
|Alice 34 Bob 45 Charlie 30.|
+---------------------------+



In [63]:
# Initialize Spark session
spark = SparkSession.builder.master("local").appName("CSV from Text File").getOrCreate()
df.show()

+--------------------+
|               value|
+--------------------+
|Alice 34 Bob 45 C...|
+--------------------+



In [81]:
import json

# Dummy data to be written to the JSON file
dummy_data = [
    [
    {
        "name": "Alice",
        "age": 34
    },
    {
        "name": "Bob",
        "age": 45
    },
    {
        "name": "Charlie",
        "age": 30
    }
]
]

# Write the dummy data to a JSON file
with open("dummy_data.json", "w") as json_file:
    json.dump(dummy_data, json_file, indent=4)

print("Dummy JSON file created successfully!")


Dummy JSON file created successfully!
