In [2]:
# Importing pandas library for data manipulation
import pandas as pd

# Reading the three datasets from CSV files
df1 = pd.read_csv('Downloads\\CleanedCountriesLiteracyRateRefined.csv')
df2 = pd.read_csv('Downloads\\GovernmentEducationSpendingRefined.csv')
df3 = pd.read_csv('Downloads\\HumanDevelopmentIndexRefined.csv')

In [3]:
# Renaming columns to have a consistent 'Country' column
df1 = df1.rename(columns={'Country': 'Country'})
df2 = df2.rename(columns={'Location': 'Country'})
df3 = df3.rename(columns={'Country': 'Country'})

# Merging all 3 DataFrames
merged_df = df1.merge(df2, on='Country', how='inner').merge(df3, on='Country', how='inner')

# Displaying the merged DataFrame
merged_df.head()

Unnamed: 0,Rank_x,Country,Average scale score,Change over 5 years,Percentage of GDP,Year,Rank_y,HDI Value,Annual Growth (2010-2021)
0,1,Singapore,587,11.0,2.9,2013,9,0.949,0.25
1,2,Ireland,577,10.0,3.7,2017,7,0.95,0.38
2,3,Hong Kong,573,4.0,3.3,2018,4,0.956,0.38
3,4,Russia,567,14.0,3.7,2020,56,0.821,0.25
4,7,Croatia,557,0.0,4.6,2013,39,0.878,0.53


In [4]:
# Displaying how many rows and columns in the new merged Data frame
merged_df.shape
# Small values eqautes to strict data regulations (Making sure the merged Data Frame to not include Nan values)

(37, 9)

In [5]:
# Sorting Dataframe by 'Average scale score' in ascending order
merged_df_asc = merged_df.sort_values(by='Average scale score', ascending=True)

# Displaying the Top 10 Countries (first few rows of the sorted DataFrames)
print("\nSorted in ascending order:\n")
print(merged_df_asc[['Country', 'Average scale score', 'Percentage of GDP', 'HDI Value']].head(10))


Sorted in ascending order:

         Country  Average scale score  Percentage of GDP  HDI Value
35  South Africa                  288                6.2      0.717
34       Morocco                  372                5.4      0.698
33         Egypt                  378                3.9      0.728
32        Jordan                  381                3.6      0.736
36  South Africa                  384                6.2      0.717
31          Iran                  413                4.0      0.780
30          Oman                  429                6.8      0.819
29    Uzbekistan                  437                6.3      0.727
28    Azerbaijan                  440                2.5      0.760
27       Bahrain                  458                2.3      0.888


In [6]:
# Sorting by 'Average scale score' in descending order
merged_df_desc = merged_df.sort_values(by='Average scale score', ascending=False)

# Displaying the Top 10 Countries (first few rows of the sorted DataFrames)
print("\nSorted in descending order:\n")
print(merged_df_desc[['Country', 'Average scale score', 'Percentage of GDP', 'HDI Value']].head(10))


Sorted in descending order:

           Country  Average scale score  Percentage of GDP  HDI Value
0        Singapore                  587                2.9      0.949
1          Ireland                  577                3.7      0.950
2        Hong Kong                  573                3.3      0.956
3           Russia                  567                3.7      0.821
4          Croatia                  557                4.6      0.878
5           Poland                  549                4.6      0.881
8          Hungary                  539                4.7      0.851
10          Norway                  539                8.0      0.966
7   Czech Republic                  539                5.6      0.895
6         Bulgaria                  539                4.1      0.799


In [7]:
# Executing summary statistics
summary_stats = merged_df.describe()

print("Summary Statistics:")
print(summary_stats)

# Calculating additional statistics
median_values = merged_df.median(numeric_only=True)
mode_values = merged_df.mode(numeric_only=True).iloc[0]

print("\nMedian Values:")
print(median_values)

print("\nMode Values:")
print(mode_values)

Summary Statistics:
          Rank_x  Average scale score  Change over 5 years  Percentage of GDP  \
count  37.000000            37.000000            37.000000          37.000000   
mean   29.000000           493.837838            12.864865           4.554054   
std    17.227239            68.105928            12.660442           1.658244   
min     0.000000           288.000000             0.000000           0.000000   
25%    17.000000           458.000000             2.000000           3.700000   
50%    29.000000           514.000000            11.000000           4.600000   
75%    43.000000           539.000000            18.000000           5.600000   
max    56.000000           587.000000            48.000000           8.000000   

              Year      Rank_y  HDI Value  Annual Growth (2010-2021)  
count    37.000000   37.000000  37.000000                  37.000000  
mean   1961.054054   47.945946   0.856378                   0.416216  
std     331.365353   34.349467   0.08

In [8]:
# Slice the DataFrame
sliced_df = merged_df.loc[5:15, ['Country', 'Average scale score', 'Percentage of GDP', 'HDI Value']]

# Display the sliced DataFrame
sliced_df

Unnamed: 0,Country,Average scale score,Percentage of GDP,HDI Value
5,Poland,549,4.6,0.881
6,Bulgaria,539,4.1,0.799
7,Czech Republic,539,5.6,0.895
8,Hungary,539,4.7,0.851
9,Denmark,539,7.6,0.952
10,Norway,539,8.0,0.966
11,Italy,537,3.8,0.906
12,Latvia,528,4.7,0.879
13,Netherlands,527,5.5,0.946
14,New Zealand,521,6.4,0.939


In [9]:
# a. Setting 'Country' column as the index
merged_df.set_index('Country', inplace=True)
print("\nDataFrame with 'Country' as index:\n")
print(merged_df.head())


DataFrame with 'Country' as index:

           Rank_x  Average scale score  Change over 5 years  \
Country                                                       
Singapore       1                  587                 11.0   
Ireland         2                  577                 10.0   
Hong Kong       3                  573                  4.0   
Russia          4                  567                 14.0   
Croatia         7                  557                  0.0   

           Percentage of GDP  Year  Rank_y  HDI Value  \
Country                                                 
Singapore                2.9  2013       9      0.949   
Ireland                  3.7  2017       7      0.950   
Hong Kong                3.3  2018       4      0.956   
Russia                   3.7  2020      56      0.821   
Croatia                  4.6  2013      39      0.878   

           Annual Growth (2010-2021)  
Country                               
Singapore                       0.25  
Irel

In [10]:
# b. Resetting the index to default integer-based index
merged_df.reset_index(inplace=True)
print("\nDataFrame with default integer index:\n")
print(merged_df.head())


DataFrame with default integer index:

     Country  Rank_x  Average scale score  Change over 5 years  \
0  Singapore       1                  587                 11.0   
1    Ireland       2                  577                 10.0   
2  Hong Kong       3                  573                  4.0   
3     Russia       4                  567                 14.0   
4    Croatia       7                  557                  0.0   

   Percentage of GDP  Year  Rank_y  HDI Value  Annual Growth (2010-2021)  
0                2.9  2013       9      0.949                       0.25  
1                3.7  2017       7      0.950                       0.38  
2                3.3  2018       4      0.956                       0.38  
3                3.7  2020      56      0.821                       0.25  
4                4.6  2013      39      0.878                       0.53  


In [11]:
# c. Creating a new DataFrame by selecting rows based on a conditional index
high_hdi_countries = merged_df[merged_df['HDI Value'] > 0.8]
print("\nCountries with HDI Value > 0.8:\n")
print(high_hdi_countries[['Country', 'HDI Value']].head())


Countries with HDI Value > 0.8:

     Country  HDI Value
0  Singapore      0.949
1    Ireland      0.950
2  Hong Kong      0.956
3     Russia      0.821
4    Croatia      0.878


In [12]:
# d. multi-level indexing by setting multiple columns as the index
merged_df.set_index(['Country', 'Year'], inplace=True)
print("\nDataFrame with multi-level index:\n")
print(merged_df.head())


DataFrame with multi-level index:

                Rank_x  Average scale score  Change over 5 years  \
Country   Year                                                     
Singapore 2013       1                  587                 11.0   
Ireland   2017       2                  577                 10.0   
Hong Kong 2018       3                  573                  4.0   
Russia    2020       4                  567                 14.0   
Croatia   2013       7                  557                  0.0   

                Percentage of GDP  Rank_y  HDI Value  \
Country   Year                                         
Singapore 2013                2.9       9      0.949   
Ireland   2017                3.7       7      0.950   
Hong Kong 2018                3.3       4      0.956   
Russia    2020                3.7      56      0.821   
Croatia   2013                4.6      39      0.878   

                Annual Growth (2010-2021)  
Country   Year                             
Singap

In [13]:
from pymongo import MongoClient


# Step 1: Establishing a connection to MongoDB
client = MongoClient('mongodb://localhost:27017/')  

# Step 2: Specifying the database and collection
db = client['education_database']  
collection = db['country_stats']   

# Step 3: Converting DataFrame to a list of dictionaries
data = merged_df.to_dict(orient='records')

# Step 4: Inserting data into the MongoDB collection
collection.insert_many(data)

# Step 5: Verifying the data insertion
document_count = collection.count_documents({})
print(f"Total records in the MongoDB collection: {document_count}")

# Step 6: Close the MongoDB connection
client.close()



Total records in the MongoDB collection: 74
