In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Read both CSV files from the working directory into df1 and df2 (in that order).
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('new_File1.csv', 'new_File2.csv'))
# Show the first table as this cell's output.
df1

In [None]:
# Show the second table (the one read from 'new_File2.csv') as this cell's output.
df2

In [None]:

# Tally the rows of df2 per department; value_counts() returns the most
# frequent department first, which is also the left-to-right bar order.
dept_counts = df2["department"].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=dept_counts.index, y=dept_counts.values, palette="coolwarm", ax=ax)
ax.set(
    xlabel="Skye8 Department",
    ylabel="Number of Interns",
    title="Number of Interns per Department",
)
# Tilt the department names so they do not overlap.
plt.xticks(rotation=45)
plt.show()


In [None]:
# Count interns for every (gender, department) pair.
# The table is named dept_mapping and the heatmap is titled
# "Gender vs Skye8 Department", so the second key is df2's department column
# (the previous version grouped by df1["School"] and never used the department).
# NOTE(review): gender comes from df1 and department from df2; rows are paired
# by index label, which assumes both tables list the same interns in the same
# order -- confirm against the data source.
dept_mapping = pd.crosstab(df1["Gender"], df2["department"])

print(dept_mapping.head())
plt.figure(figsize=(12, 6))
# fmt="d" works because crosstab produces integer counts (0 where a pair never occurs).
sns.heatmap(dept_mapping, cmap="Blues", annot=True, fmt="d")
plt.title("Gender vs Skye8 Department")
plt.show()


In [None]:

# Parse both date columns in place so they can be grouped as real dates.
df1["Start Date"] = pd.to_datetime(df1["Start Date"])
df1["End Date"] = pd.to_datetime(df1["End Date"])

# A date is "shared" only when at least this many interns have it.
# The previous threshold of 1 was true for every group, so nothing was filtered.
MIN_INTERNS_SHARING_DATE = 2


def count_interns_per_date(frame, date_column):
    """Count the rows of ``frame`` for each distinct value of ``date_column``.

    Args:
        frame: DataFrame that contains ``date_column``.
        date_column: Name of the column to group on.

    Returns:
        DataFrame with one row per distinct date and two columns:
        ``date_column`` and ``"Intern_Count"``.
    """
    return frame.groupby(date_column).size().reset_index(name="Intern_Count")


# Start dates: full tally, then only the dates shared by several interns.
start_counts = count_interns_per_date(df1, "Start Date")
same_start_dates = start_counts[start_counts["Intern_Count"] >= MIN_INTERNS_SHARING_DATE]
print(same_start_dates)

# End dates: same treatment.
end_counts = count_interns_per_date(df1, "End Date")
same_end_dates = end_counts[end_counts["Intern_Count"] >= MIN_INTERNS_SHARING_DATE]
print(same_end_dates)


In [None]:

# Two side-by-side bar charts: interns per start date (left) and per end date (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (table to plot, its date column, panel title) for each panel, left to right.
panels = [
    (same_start_dates, 'Start Date', "Interns with Same Start Date"),
    (same_end_dates, 'End Date', "Interns with Same End Date"),
]

for ax, (date_counts, date_column, panel_title) in zip(axes, panels):
    sns.barplot(
        data=date_counts,
        x=date_column,
        y='Intern_Count',
        palette="coolwarm",
        ax=ax,
    )
    # Tilt the date labels so neighbouring ticks stay readable.
    ax.tick_params(axis="x", labelrotation=45)
    ax.set_xlabel(date_column)
    ax.set_ylabel("Number of Interns")
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Bucket the ages in df1 by the department recorded in df2 (rows are paired
# by index label), then average each bucket.
grp_dpt = df1['Age'].groupby(by=df2['department'])
dept_mean = grp_dpt.agg('mean')
dept_mean



In [None]:
# Bar chart of dept_mean, i.e. the average age per department.
# The y-axis label and title describe ages; they previously repeated the
# "Number of Interns" wording of the head-count chart, which mislabelled the data.
plt.figure(figsize=(10, 5))
sns.barplot(x=dept_mean.index, y=dept_mean.values, palette="coolwarm")
plt.xticks(rotation=45)
plt.xlabel("Skye8 Department")
plt.ylabel("Mean Age")
plt.title("Mean Age per Department")
plt.show()

In [None]:
# Same computation as the earlier mean-age cell: average the df1 ages within
# each df2 department (rows are paired by index label).
department_keys = df2['department']
grp_dpt = df1['Age'].groupby(department_keys)
dept_mean = grp_dpt.mean()
dept_mean



In [None]:

# Pie chart of the per-department mean ages. Each wedge is annotated with the
# mean value itself instead of its percentage share of the pie.
mean_ages = dept_mean.values
total_of_means = sum(mean_ages)


def _wedge_label(pct):
    """Turn the wedge percentage matplotlib passes in back into the underlying value."""
    return '{:.1f}'.format(pct * total_of_means / 100)


plt.figure(figsize=(8, 8))
plt.pie(
    mean_ages,
    labels=dept_mean.index,
    autopct=_wedge_label,
    colors=sns.color_palette("coolwarm", len(dept_mean)),
)
plt.title("Mean Age Distribution per Department")
plt.show()

In [None]:
# Mean internship length (in days) per school, sorted from shortest to longest.
# The duration is derived here from the two date columns rather than read from
# df1["Internship_Duration"]: that column is only added by a cell further down
# the notebook, so reading it here raises KeyError on a fresh top-to-bottom run.
# The formula is the same one that later cell uses (End Date - Start Date, in days).
internship_days = (
    pd.to_datetime(df1["End Date"], errors="coerce")
    - pd.to_datetime(df1["Start Date"], errors="coerce")
).dt.days.rename("Internship_Duration")
university_duration = internship_days.groupby(df1['School']).mean().sort_values()
university_duration

In [None]:
# Lollipop chart: a thin stem from zero to each school's mean duration, capped with a dot.
schools = university_duration.index
mean_days = university_duration.values

fig, ax = plt.subplots(figsize=(10, 6))
ax.hlines(y=schools, xmin=0, xmax=mean_days, color='skyblue', linewidth=2)  # stems
ax.plot(mean_days, schools, "o", markersize=8, color='coral')  # heads
ax.set_xlabel("Average Internship Duration (Days)")
ax.set_ylabel("University")
ax.set_title("Average Internship Duration by University (Lollipop Chart)")
# Vertical guide lines only, to help read values off the x-axis.
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:

# Convert both date columns to datetimes in place; with errors="coerce" any
# value that cannot be parsed becomes NaT instead of raising.
for date_column in ("Start Date", "End Date"):
    df1[date_column] = pd.to_datetime(df1[date_column], errors="coerce")

In [None]:

# Length of each internship in whole days: end date minus start date.
df1["Internship_Duration"] = df1["End Date"].sub(df1["Start Date"]).dt.days
# Show the table with the new column as this cell's output.
df1

In [None]:
# Number of interns per calendar month of their start date.
start_month = df1["Start Date"].dt.to_period("M")
intern_distribution = df1.groupby(start_month).size()

# Filled area chart of the monthly counts.
fig, ax = plt.subplots(figsize=(10, 5))
intern_distribution.plot.area(alpha=0.5, color='skyblue', ax=ax)
ax.set_title("Intern Distribution Over Time")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Interns")
plt.xticks(rotation=45)
ax.grid()
plt.show()

In [None]:

# The five most frequent values of df1["School"], most frequent first.
top_universities = df1["School"].value_counts().head(5)
# pandas returns the matplotlib Axes it drew on; `bars` is that Axes object.
bars = top_universities.plot.bar(color="skyblue")

# Write each bar's count just above its top edge, centred on the bar.
for patch in bars.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    bars.text(bar_centre, bar_top, int(bar_top), ha='center', va='bottom')

bars.set_title("Top 5 Universities Providing Interns")
bars.set_xlabel("University")
bars.set_ylabel("Number of Interns")
plt.show()

In [None]:
# Word Cloud for Internship Goals
# This notebook only loads df1 and df2 (there is no `df`), so use whichever of
# the two tables actually has the "Internship_Goals" column.
goals_frame = next(
    (frame for frame in (df1, df2) if "Internship_Goals" in frame.columns),
    None,
)
if goals_frame is None:
    raise KeyError("Neither df1 nor df2 has an 'Internship_Goals' column")

# Join every non-missing goal into one string; str() guards against non-text cells.
text = " ".join(str(goal) for goal in goals_frame["Internship_Goals"].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # the image has no meaningful axes
plt.title("Most Common Words in Internship Goals")
plt.show()