In [None]:
#filtering organisation names with the strings 'nsf' or 'national science foundation' from sample.csv

import pandas as pd

def filter_non_matching_organisations(csv_file, pattern='nsf|national science foundation', column='Organisation'):
    """Return the rows of a CSV whose organisation name matches ``pattern``.

    Despite its (historical) name, this function keeps the rows that DO match;
    the name is left unchanged so existing callers keep working.

    The match is a case-insensitive regular-expression search anywhere in the
    value, so the default pattern also hits longer tokens that merely contain
    the letters "nsf" (for example "NSFC ...").

    Parameters
    ----------
    csv_file : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.
    pattern : str, default 'nsf|national science foundation'
        Regular expression to search for in ``column``.
    column : str, default 'Organisation'
        Name of the column to search.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original row index and all columns.
        Rows whose value in ``column`` is missing are never returned.

    Raises
    ------
    KeyError
        If ``column`` is not present in the CSV.
    """
    df = pd.read_csv(csv_file)

    if column not in df.columns:
        raise KeyError(f"Column {column!r} not found in CSV; available columns: {list(df.columns)}")

    # na=False makes missing names count as "no match", so the mask is purely boolean.
    is_match = df[column].str.contains(pattern, case=False, na=False)
    matching_organisations = df[is_match]

    return matching_organisations

# Example usage: report the size of the raw table, then run the filter on the same file.
csv_file_path = '/content/drive/MyDrive/Independent Study/sample.csv'  # Replace with the actual file path
raw_shape = pd.read_csv(csv_file_path).shape  # (rows, columns) before any filtering
print(raw_shape)
result_nsf_df = filter_non_matching_organisations(csv_file_path)

# Heading and filtered frame, each on its own line.
print("Organizations matching the criteria:", result_nsf_df, sep="\n")

(10000, 1)
Organizations matching the criteria:
                                           Organisation
15                                             NSF-GFRP
21                                              NSF QIS
24        National Science Foundation under IGERT award
54                                             NSF/NNIN
55    National Science Foundation Dimensions of Biod...
...                                                 ...
9897                                   NSF through CBET
9909  Nano/Bio Interface Center at the University of...
9943  National Science Foundation (NSF) Division of ...
9967            National Science Foundation (NSF) [DEB]
9975                                         US NSF-EAR

[1163 rows x 1 columns]


In [None]:
# Flag every row whose organisation name occurs more than once.
# keep=False marks all copies of a repeated name, not only the later ones.
organisation_names = result_nsf_df['Organisation']
duplicates_count = organisation_names.duplicated(keep=False)  # boolean mask, despite the name
duplicates_df = result_nsf_df.loc[duplicates_count]

# Tally how many times each repeated name appears
value_counts = duplicates_df['Organisation'].value_counts()

# Show each repeated name next to its count
print("Duplicates and their Counts:", value_counts, sep="\n")

Duplicates and their Counts:
National Science Foundation's BRIDGE                                                              2
U.S. National Science Foundation within the Directorate for Biological Sciences                   2
US NSF-CNH Program                                                                                2
National Science Foundation Industry/University Cooperative Research Centers program              2
NSF CNS Grant                                                                                     2
NSF Partnerships for Innovation: Building Innovation Capacity (PFI:BIC) Program                   2
National Science Foundation Historically Black Colleges and Universities Undergraduate Program    2
National Science Foundation (NSF)/INSPIRE grant                                                   2
United States National Science Foundation Office of Polar Programs                                2
NSF Office of Polar Programs                                           

In [None]:
# Keep only the first occurrence of each organisation name, then relabel the
# column so it is clear the values are de-duplicated.
unique_nsf_df = (
    result_nsf_df
    .drop_duplicates(subset='Organisation', keep='first')
    .rename(columns={'Organisation': 'UniqueOrganisation'})
)
print(unique_nsf_df)

                                     UniqueOrganisation
15                                             NSF-GFRP
21                                              NSF QIS
24        National Science Foundation under IGERT award
54                                             NSF/NNIN
55    National Science Foundation Dimensions of Biod...
...                                                 ...
9897                                   NSF through CBET
9909  Nano/Bio Interface Center at the University of...
9943  National Science Foundation (NSF) Division of ...
9967            National Science Foundation (NSF) [DEB]
9975                                         US NSF-EAR

[1153 rows x 1 columns]


In [None]:
nsf_df = pd.read_csv('/content/drive/MyDrive/Independent Study/nsf_sample.csv')

# Sanity check for the string filter: every organisation listed in nsf_sample.csv
# should also appear among the de-duplicated filter results.
is_covered = nsf_df['Organisation'].isin(unique_nsf_df['UniqueOrganisation'])
all_values_in_column2 = is_covered.all()

if all_values_in_column2:
    # Report the actual sizes instead of hard-coded numbers so the message cannot go stale.
    print(f"All values from nsf_df ({len(nsf_df)}) are in unique_nsf_df ({len(unique_nsf_df)})")
else:
    # Make a failed check visible: list the names the string filter did not pick up.
    missing_values = nsf_df.loc[~is_covered, 'Organisation'].unique()
    print(f"{len(missing_values)} value(s) from nsf_df are missing from unique_nsf_df:")
    print(missing_values)

# Recorded result of the original run: all values from nsf_df (1000) were in
# unique_nsf_df (1153), i.e. the string filter did not miss any of them.

All values from nsf_df (1000) are in unique_nsf_df (1153)


In [None]:
# Collect the names the string filter returned that nsf_sample.csv does not list.
absent_from_nsf = ~unique_nsf_df['UniqueOrganisation'].isin(nsf_df['Organisation'])
unique_values_in_df1 = unique_nsf_df.loc[absent_from_nsf, 'UniqueOrganisation'].unique()

# Show those extra names
print("Values in unique_nsf_df but not in nsf_df:", unique_values_in_df1)

Values in unique_nsf_df but not in nsf_df: ['National Science Foundation Dimensions of Biodiversity grant' 'USA: NSF'
 'University National Science Foundation of Jiangsu Province of China'
 'NSF Instrumentation Facilities'
 'Department of Veterans Affairs, Office of Patient-Centered Care and Cultural Transformation'
 'NSFC for Excellent Youth Scholars of China'
 "Chinese National Science Foundation's"
 'Advanced Catalytic Transformation program for Carbon utilization (ACT-C) project of the Japan Science and Technology Agency (JST)'
 'National Science Foundation postdoctoral fellowship'
 'UMass Center for Hierarchical Manufacturing (CHM), a NSF Nanoscale Science and Engineering Center'
 'NSF Assembling the Tree of Life (AToL)'
 'NSF Science and Technology Center-Materials and Devices for Information Technology'
 'Adaptable and Seamless Technology Transfer Program'
 'Belgian National Science Foundation'
 'U.S. National Science Foundation Climate Dynamics Program'
 'International Cooperat

In [None]:
# Stack both name columns end to end into a single Series
merged_column = pd.concat([nsf_df['Organisation'], unique_nsf_df['UniqueOrganisation']])

# Collapse repeats and wrap the distinct names in a one-column DataFrame
distinct_names = merged_column.unique()
unique_values_df = pd.DataFrame({'UniqueValues': distinct_names})

# Write the distinct names to CSV, leaving out the row index
output_path = '/content/drive/MyDrive/Independent Study/unique_values.csv'
unique_values_df.to_csv(output_path, index=False)

In [None]:
# Hard-coded list of organisation names that the substring search below runs on.
# The original note on this cell reads "gemini code".
# NOTE(review): presumably a hand-picked sample of the names in sample.csv - TODO confirm.
data = [
    "NUS Startup Grant",
    "European Space Agency",
    "Outstanding Talent Cultivation Project of Liaoning Province",
    "Tsinghua Independent Research Grant",
    "Serpentes Foundation",
    "University of New South Wales ECR",
    "Basic Science Research Program through the National Research Foundation of Korea - Ministry of Education, Science, and Technology of Korea",
    "USA: NSF",
    "Chongqing Key Laboratory of Birth Defects and Reproductive Health",
    "US National Science Foundation (EFRI-SEED Award)",
    "NSF/CSEDI",
    "NSF-ARI Program",
    "University National Science Foundation of Jiangsu Province of China",
    "Commission of the European",
    "Centro Fermi-Museo Storico della Fisica e Centro Studi e Ricerche \"Enrico Fermi\", Italy",
    "Bristol-Myers/Squibb",
    "Aspen Center for Physics (NSF)",
    "Japan Marine Science and Technology Center",
    "Research Program on Climate Change Adaptation (RECCA)",
    "Bavarian State Ministry of Sciences, Research",
    "Brink/McLean Grassland Conservation Fund",
    "National Science and Technology Major Project of Twelfth Five Years",
    "Clinical Research Unit",
    "EquipEx PETAL+",
    "National Science Foundation's BRIDGE",
    "NSF-EF",
    "Center of Excellence on Food Agricultural Machinery (Kasetsart University)",
    "Secretaria de Ciencia y Tecnica de la Universidad Nacional de Rio Cuarto y de la Universidad Nacional de Cordoba y Ministerio de Ciencia y Tecnica de la Provincia de Cordoba",
    "Pfizer Inc. (New York, NY)",
    "Max Planck Graduate Center (MPGC) Mainz",
    "Strategic Priority Research Program of the Shandong Academy of Sciences",
    "NIH Bench-to-Bedside award",
    "China National Science Fundation",
    "Exploration-Oriented Key Scientific and Technological Innovation Project from Ministry of Education of China",
    "Stroke fund",
    "Direccion de Investigacion, Universidad de Los Lagos",
    "National Institutes of Neurological Diseases",
    "City of Gdynia",
    "Harvard Skin Diseases Pilot Study Grant",
    "Fondo Integrativo Speciale per la Ricerca",
    "Xinjiang Uygur Autonomous Region regional coordinated innovation project (Shanghai cooperation organization science and technology partnership program)",
    "National Science Foundation -Earth Sciences",
    "EPSRC/InnovateUK",
    "Natural Science Foundation of the Education Bureau of Anhui Province",
    "BMRC-SERC Joint Grant",
    "Queensland Parks and Wildlife Service",
    "HIPC",
    "National Institute on Alcohol Abuse and Alcoholism of the United States",
    "Becas-CONICYT Programme",
    "US National Science Foundation through the International Arctic Research Center",
    "NSF/DMR"
]

# Substrings to look for; the comparison below lower-cases both sides
search_terms = ["nsf", "national science foundation"]

# One "<name> (Reason: ...)" line per organisation that contains a search term
matched_organizations = []

for organization in data:
    lowered_name = organization.lower()
    # Only the first search term found in the name is reported, in list order.
    first_hit = next(
        (term for term in search_terms if term.lower() in lowered_name),
        None,
    )
    if first_hit is not None:
        matched_organizations.append(
            f"{organization} (Reason: contains the string '{first_hit}')"
        )

# Report the matches, one per line, or say that there were none.
# NOTE: a match only means the name contains one of the search strings; the
# wording "affiliated with" in the messages is not verified by this code.
if not matched_organizations:
    print("No organizations affiliated with the National Science Foundation were found.")
else:
    print("Matched organizations affiliated with the National Science Foundation:")
    print(*matched_organizations, sep="\n")

Matched organizations affiliated with the National Science Foundation:
USA: NSF (Reason: contains the string 'nsf')
US National Science Foundation (EFRI-SEED Award) (Reason: contains the string 'national science foundation')
NSF/CSEDI (Reason: contains the string 'nsf')
NSF-ARI Program (Reason: contains the string 'nsf')
University National Science Foundation of Jiangsu Province of China (Reason: contains the string 'national science foundation')
Aspen Center for Physics (NSF) (Reason: contains the string 'nsf')
National Science Foundation's BRIDGE (Reason: contains the string 'national science foundation')
NSF-EF (Reason: contains the string 'nsf')
National Science Foundation -Earth Sciences (Reason: contains the string 'national science foundation')
US National Science Foundation through the International Arctic Research Center (Reason: contains the string 'national science foundation')
NSF/DMR (Reason: contains the string 'nsf')


In [None]:
USA: NSF	Explicitly mentions NSF in its name.
US National Science Foundation (EFRI-SEED Award)	Funded by or affiliated with NSF based on known funding programs or collaborations.
NSF/CSEDI	Explicitly mentions NSF in its name.
NSF-ARI Program	Explicitly mentions NSF in its name.
University National Science Foundation of Jiangsu Province of China	Funded by or affiliated with NSF based on known funding programs or collaborations.
National Science Foundation's BRIDGE	Funded by or affiliated with NSF based on known funding programs or collaborations.
NSF-EF	Explicitly mentions NSF in its name.
National Science Foundation -Earth Sciences	Funded by or affiliated with NSF based on known funding programs or collaborations.
EPSRC/InnovateUK	Joint program funded by EPSRC (UK) and Innovate UK, which collaborate with NSF on various research initiatives.
US National Science Foundation through the International Arctic Research Center	Funded by or affiliated with NSF based on known funding programs or collaborations.
NSF/DMR	Explicitly mentions NSF in its name.