In [1]:
import pandas as pd
import json

# Parse faculty_raw.json (decoded as UTF-8) into plain Python objects.
with open("faculty_raw.json", mode="r", encoding="utf-8") as raw_file:
    data = json.loads(raw_file.read())

# Wrap the parsed payload in a DataFrame for the cleaning steps below.
df = pd.DataFrame(data)

# Report the size of the raw table and preview its first rows.
print("Initial shape:", df.shape)
df.head()


Initial shape: (109, 7)


Unnamed: 0,faculty_type,name,education,phone,address,email,specialization
0,regular,Abhishek gupta,"PhD (Electrical and Computer Engineering), Tor...",079-68261598,"# 3208, FB-3, DAU, Gandhinagar, Gujarat, India...",abhishek_gupta[at]dau[dot]ac[dot]in,"Machine Learning, Statistical Signal Processin..."
1,regular,Abhishek jindal,"PhD (Electronics & Communication Engineering),...",079-68261654,"# 4101, FB-4, DA-IICT, Gandhinagar, Gujarat, I...",abhishek_jindal[at]dau[dot]ac[dot]in,"Reinforcement Learning, Deep Learning for Fina..."
2,regular,Abhishek tilva,"PhD (Statistics), Columbia University, New Yor...",079-68261549,"# 2209, FB-2, DA-IICT, Gandhinagar, Gujarat, I...",abhishek_tilva[at]dau[dot]ac[dot]in,"Arbitrage Theory, Stochastic Portfolio Theory,..."
3,regular,Aditya tatu,"PhD (Computer Science), University of Copenhag...",079-68261540,"#1206, FB-1, DA-IICT, Gandhinagar, Gujarat, In...",aditya_tatu[at]dau[dot]ac[dot]in,"Computer Vision, Image Processing, Pattern Rec..."
4,regular,Ajay beniwal,PhD (Electronics and Communication Engineering...,079-68261745,"# 3211, FB-3, DA-IICT, Gandhinagar, Gujarat, I...",ajay_beniwal[at]dau[dot]ac[dot]in,Flexible and Printable Electronics for Healthc...


In [2]:
# Normalise every column label: trim surrounding whitespace, lower-case it,
# and replace spaces with underscores.
df.columns = df.columns.map(
    lambda label: label.strip().lower().replace(" ", "_")
)

df.columns


Index(['faculty_type', 'name', 'education', 'phone', 'address', 'email',
       'specialization'],
      dtype='object')

In [3]:
# Treat empty AND whitespace-only strings as missing values.
# Matching only the exact empty string "" lets values such as " " or "\t"
# slip through as if they were real data; the anchored regex catches every
# string that consists solely of whitespace (including the empty string).
# Non-string cells are left untouched by a regex replace.
df = df.replace(r"^\s*$", pd.NA, regex=True)

# Per-column count of missing values after the replacement.
df.isna().sum()


faculty_type       0
name               0
education          2
phone             32
address           35
email              1
specialization     3
dtype: int64

In [4]:
def clean_email(email):
    """Decode an obfuscated e-mail address into its plain form.

    Turns text such as ``"user[at]host[dot]tld"`` into ``"user@host.tld"``.

    Parameters
    ----------
    email : str or missing value
        The obfuscated address. Missing values (anything for which
        ``pd.isna`` is true, e.g. ``None`` or ``pd.NA``) are returned
        unchanged.

    Returns
    -------
    str or missing value
        The lower-cased address with all whitespace removed and every
        ``[at]`` / ``[dot]`` marker replaced by ``@`` / ``.``.
    """
    if pd.isna(email):
        return email
    # Normalise first, decode second: stripping whitespace and lower-casing
    # before the marker replacement means variants such as "[AT]", "[ at ]"
    # or "[Dot]" are decoded too instead of being left in the output.
    # str.split() with no argument splits on any whitespace, so tabs and
    # newlines are removed as well as plain spaces.
    compact = "".join(email.split()).lower()
    return compact.replace("[at]", "@").replace("[dot]", ".")

# Run clean_email over every entry of the email column, element by element.
df["email"] = df["email"].map(clean_email)


In [5]:
# Free-text columns to tidy up. Columns that are not present in this
# dataset are skipped by the membership check in the loop below.
text_cols = [
    "name",
    "education",
    "specialization",
    "area_specialization_short",
    "biography"
]

for col in text_cols:
    if col in df.columns:
        # Record which entries are missing BEFORE casting to str: the cast
        # turns pd.NA into the literal text "<NA>" (and None into "None"),
        # which would otherwise be kept as if it were real data.
        was_missing = df[col].isna()
        stripped = df[col].astype(str).str.strip()
        # Restore the original missing entries, and also treat values that
        # are empty after stripping, or the literal "nan", as missing.
        df[col] = stripped.mask(
            was_missing | stripped.isin(["", "nan"]),
            pd.NA,
        )


In [6]:
# Keep every present faculty_type value; label the missing ones "unknown".
df["faculty_type"] = df["faculty_type"].where(
    df["faculty_type"].notna(),
    "unknown",
)


In [7]:
# Row count before de-duplication, used for the report below.
before = len(df)

# A row is a duplicate when an earlier row has the same (name, faculty_type)
# pair; the first occurrence of each pair is kept.
df = df[~df.duplicated(subset=["name", "faculty_type"], keep="first")]

after = len(df)

print("Duplicates removed:", before - after)
print("Final records:", after)


Duplicates removed: 0
Final records: 109


In [8]:
# Order rows by faculty type, then by name within each type (ascending);
# rows with a missing sort key are placed last.
df = df.sort_values(
    ["faculty_type", "name"],
    ascending=True,
    na_position="last",
)


In [9]:
# Write the cleaned table as a JSON array with one object per row.
# force_ascii=False keeps non-ASCII characters as-is instead of \u escapes;
# indent=2 pretty-prints the file.
df.to_json("faculty_clean.json", orient="records", force_ascii=False, indent=2)

print("✅ faculty_clean.json created")


✅ faculty_clean.json created


In [10]:
# Final check: overall table size, then the number of rows per faculty type.
print("Final dataset shape:", df.shape)
faculty_type_counts = df["faculty_type"].value_counts()
faculty_type_counts


Final dataset shape: (109, 7)


faculty_type
regular                  67
adjunct                  26
international_adjunct    11
professor_of_practice     4
distinguished             1
Name: count, dtype: int64